diff --git a/examples/training/llama/finetune_llama.py b/examples/training/llama/finetune_llama.py index 8daaa5a59..fe4095d0e 100755 --- a/examples/training/llama/finetune_llama.py +++ b/examples/training/llama/finetune_llama.py @@ -80,7 +80,7 @@ def train(model_id, tokenizer, dataset, training_args): args = training_args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=2048, + max_length=2048, packing=True, **args, ) @@ -91,7 +91,7 @@ def train(model_id, tokenizer, dataset, training_args): args=sft_config, model=model, peft_config=lora_config, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=lambda example: format_dolly(example, tokenizer), ) diff --git a/examples/training/qwen3/finetune_qwen3.py b/examples/training/qwen3/finetune_qwen3.py index f7a27bbb6..8e3b25712 100644 --- a/examples/training/qwen3/finetune_qwen3.py +++ b/examples/training/qwen3/finetune_qwen3.py @@ -84,7 +84,7 @@ def train(model_id, tokenizer, dataset, training_args): args = training_args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=4096, + max_length=4096, packing=True, **args, ) @@ -98,7 +98,7 @@ def formatting_function(examples): args=sft_config, model=model, peft_config=lora_config, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=formatting_function, ) diff --git a/examples/training/qwen3/finetune_qwen3.sh b/examples/training/qwen3/finetune_qwen3.sh index d64a6572d..b2d7568e3 100755 --- a/examples/training/qwen3/finetune_qwen3.sh +++ b/examples/training/qwen3/finetune_qwen3.sh @@ -13,7 +13,8 @@ TP_DEGREE=8 BS=1 GRADIENT_ACCUMULATION_STEPS=8 LOGGING_STEPS=2 -MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +# MODEL_NAME="Qwen/Qwen3-8B" # Change this to the desired model name +MODEL_NAME="Qwen/Qwen3-0.6B" # Change this to the desired model name OUTPUT_DIR="$(echo $MODEL_NAME | cut -d'/' -f2)-finetuned" DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) diff --git a/optimum/neuron/trainers/sft_config.py b/optimum/neuron/trainers/sft_config.py index 8c50d033f..45cb07b3e 100644 --- a/optimum/neuron/trainers/sft_config.py +++ b/optimum/neuron/trainers/sft_config.py @@ -10,7 +10,7 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# Seg the License for the specific language governing permissions and +# See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass @@ -32,4 +32,24 @@ def __init__(self, *args, **kwargs): @dataclass class NeuronSFTConfig(NeuronTrainingArguments, SFTConfig): - pass + """ + Configuration class for Neuron-optimized SFT training. + + Inherits from both NeuronTrainingArguments (for Trainium-specific settings) and + trl's SFTConfig (for SFT-specific settings). 
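+    Illustrative usage sketch (values are placeholders mirroring the bundled
+    Llama example, not required defaults):
+
+        args = training_args.to_dict()
+        sft_config = NeuronSFTConfig(
+            max_length=2048,
+            packing=True,
+            **args,
+        )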
+ + Key Neuron-specific behavior: + - padding_free is always set to False to avoid recompilation on Trainium devices + - All other SFT parameters from trl 0.24.0+ are supported + """ + + def __post_init__(self): + # Handle max_seq_length -> max_length migration for backward compatibility + if hasattr(self, "max_seq_length") and self.max_seq_length is not None: + self.max_length = self.max_seq_length + + # Force padding_free to False for Neuron - critical for avoiding recompilation + # Neuron devices require fixed input shapes; padding_free flattening breaks this requirement + self.padding_free = False + + super().__post_init__() diff --git a/optimum/neuron/trainers/sft_trainer.py b/optimum/neuron/trainers/sft_trainer.py index c9a481bb4..29b9b6b2a 100644 --- a/optimum/neuron/trainers/sft_trainer.py +++ b/optimum/neuron/trainers/sft_trainer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import Any, Callable import datasets @@ -21,7 +22,6 @@ from torch.utils.data import Dataset, IterableDataset from transformers import ( AutoModelForCausalLM, - AutoTokenizer, DataCollator, DataCollatorForLanguageModeling, PreTrainedModel, @@ -75,13 +75,18 @@ class PeftConfig: class NeuronSFTTrainer(_SFTTrainer): """ - `SFTTrainer` adapted for Neuron. - - It differs from the original `SFTTrainer` by: - - Using `_TrainerForNeuron.__init__()` instead of `Trainer.__init__()` - - Using the `_TrainerForNeuron.train()` instead of `Trainer.train()` - - Adapts the `_prepare_non_packed_dataloader` to pad to max length. In the original `SFTTrainer` examples are - not padded, which is an issue here because it triggers compilation every time. + `SFTTrainer` adapted for Neuron (Trainium) devices. 
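+
+    Illustrative usage sketch (mirrors the bundled examples; `sft_config`,
+    `lora_config`, `tokenizer`, `dataset` and `format_dolly` are placeholders
+    defined by the calling script):
+
+        trainer = NeuronSFTTrainer(
+            args=sft_config,
+            model=model,
+            peft_config=lora_config,
+            processing_class=tokenizer,
+            train_dataset=dataset,
+            formatting_func=lambda example: format_dolly(example, tokenizer),
+        )
+        trainer.train()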
+ + Overrides key methods for Neuron compatibility: + - Uses NeuronTrainer.__init__() instead of transformers.Trainer.__init__() + - Uses NeuronTrainer.train() for Neuron-optimized training + - Enforces padding_free=False for fixed input shapes (required for Trainium) + - Simplifies _prepare_dataset to delegate to parent with Neuron constraints + + Neuron-specific constraints: + - padding_free is always False to avoid recompilation + - VLM training is not yet supported + - NeFTune training is not supported """ def __init__( @@ -91,33 +96,37 @@ def __init__( data_collator: DataCollator | None = None, # type: ignore train_dataset: "Dataset | IterableDataset | datasets.Dataset | None" = None, eval_dataset: "Dataset | dict[str, Dataset] | datasets.Dataset | None" = None, - processsing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None, + processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None, + compute_loss_func: Callable | None = None, + compute_metrics: Callable | None = None, callbacks: list[TrainerCallback] | None = None, - optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None, - tokenizer: PreTrainedTokenizerBase | None = None, # deprecated + preprocess_logits_for_metrics: Callable | None = None, peft_config: PeftConfig | None = None, formatting_func: Callable | None = None, + # Deprecated parameters for backward compatibility + tokenizer: PreTrainedTokenizerBase | None = None, # Use processing_class instead ): if not is_trl_available(required_version=TRL_VERSION): raise RuntimeError(f"Using NeuronSFTTrainer requires trl=={TRL_VERSION}.") from trl.extras.dataset_formatting import get_formatting_func_from_dataset - - # This will be changed to : from trl.trainer.callbacks import RichProgressCallback - from trl.trainer.utils import ( - DataCollatorForCompletionOnlyLM, - peft_module_casting_to_bf16, - ) + from trl.trainer.utils import peft_module_casting_to_bf16 if is_peft_available(): from peft import PeftConfig + # Handle backward compatibility for tokenizer parameter + if tokenizer is not None and processing_class is None: + processing_class = tokenizer + args_is_none = args is None if args is None: - output_dir = "tmp_trainer" - args = NeuronSFTConfig(output_dir=output_dir) + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = NeuronSFTConfig(f"{model_name}-SFT") elif args is not None and args.__class__.__name__ == "NeuronTrainingArguments": args_as_dict = args.to_dict() # Manually copy token values as TrainingArguments.to_dict() redacts them @@ -132,7 +141,8 @@ def __init__( if args_is_none: logging.warning(f"No `SFTConfig` passed, using `output_dir={args.output_dir}`.") - if getattr(args, "model_init_kwargs", None) is None: + # Model handling - use model_init_kwargs from args + if args.model_init_kwargs is None: model_init_kwargs = {} elif not isinstance(model, str): raise ValueError("You passed model_init_kwargs to the SFTConfig, but your model is already instantiated.") @@ -150,16 +160,51 @@ def __init__( model_init_kwargs["dtype"] = torch_dtype if isinstance(model, str): - logging.warning( - "You passed a model_id to the SFTTrainer. 
This will automatically create an " - "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." - ) - model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + model_id = model + dtype = model_init_kwargs.get("dtype") + if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: + pass # dtype is already a torch.dtype or "auto" or None + elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]: + dtype = getattr(torch, dtype) + model_init_kwargs["dtype"] = dtype + else: + raise ValueError( + "Invalid `dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing " + f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}." + ) + model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + logger.warning( + "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) - if args.packing and data_collator is not None and isinstance(data_collator, DataCollatorForCompletionOnlyLM): - raise ValueError( - "You passed a `DataCollatorForCompletionOnlyLM` to the NeuronSFTTrainer. This is not compatible with the `packing` argument." - ) + # Chat template handling (trl 0.24.0+) + # This allows users to provide a custom chat template via path or directory + if hasattr(args, 'chat_template_path') and args.chat_template_path is not None: + from trl.models import clone_chat_template + + if os.path.isfile(args.chat_template_path) and args.chat_template_path.endswith((".jinja", ".j2")): + # Load Jinja template directly + with open(args.chat_template_path, encoding="utf-8") as chat_template_file: + processing_class.chat_template = chat_template_file.read() + added_tokens = [] + else: + # Clone template from another model + try: + model, processing_class, added_tokens = clone_chat_template( + model, processing_class, args.chat_template_path + ) + except Exception as e: + logger.warning( + f"Failed to clone chat template from {args.chat_template_path}: {e}. " + "Continuing without custom chat template." 
+ ) + added_tokens = [] + else: + added_tokens = [] if is_peft_available() and peft_config is not None: if not isinstance(peft_config, PeftConfig): @@ -188,24 +233,31 @@ def make_inputs_require_grad(module, input, output): if args is not None and args.bf16: peft_module_casting_to_bf16(model) - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) - if getattr(tokenizer, "pad_token", None) is None: - tokenizer.pad_token = tokenizer.eos_token + # Processing class (tokenizer) handling + if processing_class is None: + from transformers import AutoProcessor + + processing_class = AutoProcessor.from_pretrained(model_id) - if args.max_seq_length is None: - # to overcome some issues with broken tokenizers - args.max_seq_length = min(tokenizer.model_max_length, 1024) + # Ensure we have a pad token + if hasattr(processing_class, "pad_token") and getattr(processing_class, "pad_token", None) is None: + processing_class.pad_token = processing_class.eos_token + + if args.max_length is None: + # To overcome some issues with broken tokenizers + args.max_length = min(processing_class.model_max_length, 1024) logger.warning( - f"You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to {args.max_seq_length}" + f"You didn't pass a `max_length` argument to the SFTTrainer, this will default to {args.max_length}" ) self.dataset_num_proc = args.dataset_num_proc - self.dataset_batch_size = args.dataset_batch_size + # We do not support NeFTune with NeuronSFTTrainer for now. + self._trainer_supports_neftune = False - self._trainer_supports_neftune = hasattr(args, "neftune_noise_alpha") + # Vision Language Model (VLM) support - not yet supported in Neuron + self._is_vlm = False if args.dataset_kwargs is None: args.dataset_kwargs = {} @@ -230,50 +282,61 @@ def make_inputs_require_grad(module, input, output): "You passed `packing=False` to the SFTTrainer/SFTConfig, but you didn't pass a `dataset_text_field` or `formatting_func` argument." ) + # Data collator creation with Neuron-specific constraints if data_collator is None: - data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + # Determine if this is a VLM (vision language model) + is_vlm = isinstance(processing_class, ProcessorMixin) and hasattr(processing_class, 'image_processor') + + if is_vlm: + # VLM support is not yet implemented in Neuron + logger.warning( + "Vision Language Model (VLM) detected. VLM training is not yet fully supported in Neuron. " + "Attempting to use standard language modeling collator." + ) + # For now, use standard collator - user can override if needed + data_collator = DataCollatorForLanguageModeling( + tokenizer=processing_class.tokenizer if hasattr(processing_class, 'tokenizer') else processing_class, + mlm=False, + ) + else: + # Standard language modeling collator + data_collator = DataCollatorForLanguageModeling(tokenizer=processing_class, mlm=False) + + # Ensure padding_free is False - critical Neuron requirement + # (this is already done in NeuronSFTConfig.__post_init__, but double-check) + if hasattr(data_collator, 'padding_free'): + data_collator.padding_free = False # Pre-process the datasets only once per node. The remaining processes will use the cache. 
with NeuronPartialState().local_main_process_first(): if train_dataset is not None: train_dataset = self._prepare_dataset( - train_dataset, - tokenizer, - args.packing, - args.dataset_text_field, - args.max_seq_length, - formatting_func, - args.num_of_sequences, - args.chars_per_token, - remove_unused_columns=args.remove_unused_columns if args is not None else True, - **args.dataset_kwargs, + train_dataset, processing_class, args, args.packing, formatting_func, "train" ) if eval_dataset is not None: _multiple = isinstance(eval_dataset, dict) _eval_datasets = eval_dataset if _multiple else {"singleton": eval_dataset} - eval_packing = args.packing if args.eval_packing is None else args.eval_packing - for _eval_dataset_name, _eval_dataset in _eval_datasets.items(): _eval_datasets[_eval_dataset_name] = self._prepare_dataset( _eval_dataset, - tokenizer, - eval_packing, - args.dataset_text_field, - args.max_seq_length, + processing_class, + args, + args.eval_packing if args.eval_packing is not None else args.packing, formatting_func, - args.num_of_sequences, - args.chars_per_token, - remove_unused_columns=args.remove_unused_columns if args is not None else True, - **args.dataset_kwargs, + _eval_dataset_name, ) if not _multiple: eval_dataset = _eval_datasets["singleton"] - if tokenizer.padding_side is not None and tokenizer.padding_side != "right": + if ( + hasattr(processing_class, "padding_side") + and processing_class.padding_side is not None + and processing_class.padding_side != "right" + ): logger.warning( - "You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " - "overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code." + "You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " + 'overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = "right"` to your code.' ) NeuronTrainer.__init__( @@ -283,7 +346,7 @@ def make_inputs_require_grad(module, input, output): data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, - processing_class=tokenizer, + processing_class=processing_class, callbacks=callbacks, optimizers=optimizers, optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, @@ -313,62 +376,60 @@ def train( ): return NeuronTrainer.train(self, resume_from_checkpoint=resume_from_checkpoint) - def _prepare_non_packed_dataloader( - self, - tokenizer, - dataset, - dataset_text_field, - max_seq_length, - formatting_func=None, - add_special_tokens=True, - remove_unused_columns=True, - ): - use_formatting_func = formatting_func is not None and dataset_text_field is None - self._dataset_sanity_checked = False - - # Inspired from: https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt - def tokenize(element): - outputs = tokenizer( - element[dataset_text_field] if not use_formatting_func else formatting_func(element), - add_special_tokens=add_special_tokens, - truncation=True, - # For Neuron we need to pad because otherwise it will trigger compilation for each new sequence length. - padding="max_length", - max_length=max_seq_length, - return_overflowing_tokens=False, - return_length=False, - ) + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + """ + Compute training loss for Neuron-optimized training. 
- if use_formatting_func and not self._dataset_sanity_checked: - if not isinstance(formatting_func(element), list): - raise ValueError( - "The `formatting_func` should return a list of processed strings since it can lead to silent bugs." - ) - else: - self._dataset_sanity_checked = True + Overrides TRL SFTTrainer's compute_loss to set use_cache=False for gradient + checkpointing compatibility and delegate to NeuronTrainer's compute_loss. + """ + # Set use_cache to False to avoid warnings with gradient checkpointing + inputs["use_cache"] = False - return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + # Call the parent NeuronTrainer's compute_loss method (not TRL's) + return NeuronTrainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch) - signature_columns = ["input_ids", "labels", "attention_mask"] + def training_step( + self, model: torch.nn.Module, inputs: dict[str, Any], num_items_in_batch: int | None = None + ) -> torch.Tensor: + """ + Perform a training step for Neuron-optimized training. - if dataset.column_names is not None: # None for IterableDataset - extra_columns = list(set(dataset.column_names) - set(signature_columns)) - else: - extra_columns = [] + Overrides SFTTrainer.training_step to delegate to NeuronTrainer's implementation, + which is compatible with Neuron's distributed training setup. + """ + return NeuronTrainer.training_step(self, model, inputs, num_items_in_batch=num_items_in_batch) - if not remove_unused_columns and len(extra_columns) > 0: - logger.warning( - "You passed `remove_unused_columns=False` on a non-packed dataset. This might create some issues with the default collator and yield to errors. If you want to " - f"inspect dataset other columns (in this case {extra_columns}), you can subclass `DataCollatorForLanguageModeling` in case you used the default collator and create your own data collator in order to inspect the unused dataset columns." + def _prepare_dataset( + self, + dataset, + processing_class, + args, + packing, + formatting_func=None, + dataset_name="train", + ): + """ + Prepare dataset for Neuron training. + + Delegates to parent SFTTrainer._prepare_dataset, which handles: + - Dataset type detection (language modeling, prompt-completion, conversational) + - Chat template application + - Tokenization + - Packing (if enabled) + + Neuron-specific behavior: + - Ensures padding_free=False to avoid recompilation + - Enforces padding to max_length for fixed input shapes + """ + # Ensure padding_free is disabled for Neuron - this is critical for Trainium devices + if args.padding_free: + raise ValueError( + "padding_free must be False for Neuron training. " + "Neuron devices require fixed input shapes to avoid recompilation." 
) - map_kwargs = { - "batched": True, - "remove_columns": dataset.column_names if remove_unused_columns else None, - "batch_size": self.dataset_batch_size, - } - if isinstance(dataset, datasets.Dataset): - map_kwargs["num_proc"] = self.dataset_num_proc # this arg is not available for IterableDataset - tokenized_dataset = dataset.map(tokenize, **map_kwargs) - - return tokenized_dataset + # Call parent implementation from SFTTrainer + return super()._prepare_dataset( + dataset, processing_class, args, packing, formatting_func, dataset_name + ) diff --git a/optimum/neuron/trainers/transformers.py b/optimum/neuron/trainers/transformers.py index 8e9ddcbf8..cfd009887 100644 --- a/optimum/neuron/trainers/transformers.py +++ b/optimum/neuron/trainers/transformers.py @@ -936,26 +936,29 @@ def get_batch_samples( return batch_samples, num_items_in_batch - def train_step( - self, model: nn.Module, inputs: dict[str, Any], num_items_in_batch: int | torch.Tensor | None = None - ) -> torch.Tensor: - manager = self.autocast_smart_context_manager() - + def compute_loss( + self, + model: nn.Module, + inputs: dict[str, torch.Tensor | Any], + return_outputs: bool = False, + num_items_in_batch: torch.Tensor | None = None, + ): if isinstance(model, NxDPPModel): - with manager: - loss = model.run_train(**inputs) + loss = model.run_train(**inputs) # When using pipeline parallelism, the loss is only computed on the last stage. # So we set the loss to zero on other stages. if self.pp_rank != self.pp_size - 1: dtype = torch.bfloat16 if self.args.bf16 else torch.float32 loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + + # PP does not return any outputs except the loss + outputs = {"loss": loss} else: if num_items_in_batch is not None: inputs = dict(**inputs, reduction="sum") - with manager: - outputs = model(**inputs) + outputs = model(**inputs) if isinstance(outputs, dict) and "loss" not in outputs: raise ValueError( @@ -970,8 +973,17 @@ def train_step( else: loss = loss / self.args.gradient_accumulation_steps - # Backward pass - self.accelerator.backward(loss) + return (loss, outputs) if return_outputs else loss + + def training_step( + self, model: nn.Module, inputs: dict[str, Any], num_items_in_batch: int | torch.Tensor | None = None + ) -> torch.Tensor: + manager = self.autocast_smart_context_manager() + with manager: + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + + # Backward pass + self.accelerator.backward(loss) return loss @@ -1102,7 +1114,7 @@ def train( if step % args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - loss_step = self.train_step(self.model, inputs, num_items_in_batch=num_items_in_batch) + loss_step = self.training_step(self.model, inputs, num_items_in_batch=num_items_in_batch) self.running_loss += loss_step.detach() if do_sync_step: diff --git a/optimum/neuron/trainers/trl_utils.py b/optimum/neuron/trainers/trl_utils.py index dac42fb54..4046dd93b 100644 --- a/optimum/neuron/trainers/trl_utils.py +++ b/optimum/neuron/trainers/trl_utils.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-TRL_VERSION = "0.11.4" +TRL_VERSION = "0.24.0" diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 56de4eb08..04e367429 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -85,7 +85,6 @@ "patch_within_function", "replace_class_in_inheritance_hierarchy", ], - "trl_utils": ["NeuronSFTConfig", "NeuronORPOConfig"], } if TYPE_CHECKING: @@ -155,7 +154,6 @@ patch_within_function, replace_class_in_inheritance_hierarchy, ) - from .trl_utils import NeuronORPOConfig, NeuronSFTConfig else: import sys diff --git a/pyproject.toml b/pyproject.toml index 9c1836bba..facb0eb40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ quality = [ "isort", ] training = [ - "trl == 0.11.4", + "trl == 0.23.1", "peft == 0.17.0", "evaluate == 0.4.3", ] diff --git a/tests/training/test_neuron_sft_trainer.py b/tests/training/test_neuron_sft_trainer.py index dc4f9d15e..4be2a1a80 100644 --- a/tests/training/test_neuron_sft_trainer.py +++ b/tests/training/test_neuron_sft_trainer.py @@ -77,7 +77,7 @@ def format_dolly(sample): args = args.to_dict() sft_config = NeuronSFTConfig( # Using a small sequence-length since we are not validating the outputs. - max_seq_length=128, + max_length=128, packing=packing, dataset_num_proc=1, **args, @@ -86,7 +86,7 @@ def format_dolly(sample): # Create Trainer instance trainer = NeuronSFTTrainer( model=model, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=format_dolly, args=sft_config, @@ -172,7 +172,7 @@ def format_dolly(sample): args = args.to_dict() sft_config = NeuronSFTConfig( - max_seq_length=128, + max_length=128, packing=False, # No packing for PEFT test simplicity dataset_num_proc=1, **args, @@ -181,7 +181,7 @@ def format_dolly(sample): # Create SFT Trainer instance with PEFT model trainer = NeuronSFTTrainer( model=base_model, - tokenizer=tokenizer, + processing_class=tokenizer, train_dataset=dataset, formatting_func=format_dolly, args=sft_config,