From 26597ae386cf126435c50e9eb3f3734f5245a5cc Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 6 Feb 2025 01:58:39 -0600 Subject: [PATCH 1/6] Added APOLLO optimizer integration --- docs/source/en/trainer.md | 133 ++++++++++++++++++++ src/transformers/testing_utils.py | 9 ++ src/transformers/trainer.py | 114 +++++++++++++++++ src/transformers/training_args.py | 2 + src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 + tests/trainer/test_trainer.py | 163 +++++++++++++++++++++++++ 7 files changed, 427 insertions(+) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index e3a66f420424..cc851dce4f39 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -445,6 +445,139 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. + +### APOLLO + + Approximated Gradient Scaling for Memory Efficient LLM Optimization (APOLLO) is a memory-efficient low-rank training strategy that allows full-parameter learning for both pre-training and fine-tuning, while maintaining AdamW-level performance with SGD-like memory efficiency. + + * **Ultra-low rank efficiency** → Requires much lower rank than GaLore—even rank 1 (APOLLO-Mini) suffices. + * **No expensive SVD computations** → Unlike GaLore, APOLLO leverages random projection, avoiding training stalls. + +First make sure to install APOLLO from its official repository: + +```bash +pip install apollo-torch +``` + +Then simply add one of `["apollo_adamw"]` in `optim` together with `optim_target_modules`, which can be a list of strings, regex or full path corresponding to the target module names you want to adapt. 
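For illustration only (a minimal sketch, not part of the original patch), the forms accepted by the validation logic added in this patch look like the following; only the regex-list form is used in the examples below:

```python
from transformers import TrainingArguments

# Any one of these forms is accepted for `optim_target_modules`
# (per the type/"all-linear" checks added in this patch):
args = TrainingArguments(
    output_dir="./test-apollo",
    optim="apollo_adamw",
    # 1) a list of regex patterns matching module names
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
    # 2) or a single regex string, e.g. optim_target_modules=r".*self_attn.*"
    # 3) or the shortcut "all-linear" to target every nn.Linear layer
)
```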
Below is an end-to-end example script (make sure to `pip install trl datasets`): + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-apollo", + max_steps=100, + per_device_train_batch_size=2, + optim="apollo_adamw", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"] +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=512, +) + +trainer.train() +``` + +To pass extra arguments supported by APOLLO, you should pass correctly `optim_args`, for example: + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-galore", + max_steps=100, + per_device_train_batch_size=2, + optim="galore_adamw", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"], + optim_args="proj=random,scale_type=tensor,rank=128,update_proj_gap=100,scale=1.0", + +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=512, +) + +trainer.train() +``` + +Currently only Linear layers are considered to use the APOLLO optimizers, while the remaining modueles are still using AdamW. + +You can read more about the method in the [original repository](https://github.com/zhuhanqing/APOLLO) or the [paper](https://arxiv.org/abs/2412.05270). + + +You can also perform layer-wise APOLLO by simply post-pending the optimizer name with `layerwise` like below: + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-apollo", + max_steps=100, + per_device_train_batch_size=2, + optim="apollo_adamw_layerwise", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"] +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=512, +) + +trainer.train() +``` + + ### LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). 
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 6d1965e29d79..a653162cffef 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -62,6 +62,7 @@ GGUF_MIN_VERSION, is_accelerate_available, is_apex_available, + is_apollo_torch_available, is_aqlm_available, is_auto_awq_available, is_auto_gptq_available, @@ -403,6 +404,14 @@ def require_galore_torch(test_case): return unittest.skipUnless(is_galore_torch_available(), "test requires GaLore")(test_case) +def require_apollo_torch(test_case): + """ + Decorator marking a test that requires GaLore. These tests are skipped when APOLLO isn't installed. + https://github.com/zhuhanqing/APOLLO + """ + return unittest.skipUnless(is_apollo_torch_available(), "test requires APOLLO")(test_case) + + def require_lomo(test_case): """ Decorator marking a test that requires LOMO. These tests are skipped when LOMO-optim isn't installed. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 74129a7e5c7f..2a0b4bcc12f1 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -151,6 +151,7 @@ find_labels, is_accelerate_available, is_apex_available, + is_apollo_torch_available, is_bitsandbytes_available, is_datasets_available, is_galore_torch_available, @@ -1582,6 +1583,119 @@ def optimizer_hook(param): if args.optim == OptimizerNames.GALORE_ADAFACTOR: optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) + elif args.optim in [ + OptimizerNames.APOLLO_ADAMW, + OptimizerNames.APOLLO_ADAMW_LAYERWISE, + ]: + if not is_apollo_torch_available(): + raise ImportError( + "You need to install `apollo_torch` in order to use APOLLO optimizers" + " install it with `pip install git+https://github.com/zhuhanqing/APOLLO`" + ) + from apollo_torch import APOLLOAdamW + + is_layerwise = args.optim.lower().endswith("layerwise") + if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED: + raise NotImplementedError("Layer-wise APOLLO does not support DDP at this time") + + optimizer_mapping = { + OptimizerNames.APOLLO_ADAMW: APOLLOAdamW, + OptimizerNames.APOLLO_ADAMW_LAYERWISE: APOLLOAdamW, + } + + optimizer_cls = optimizer_mapping[args.optim] + + if args.optim_target_modules is None: + raise ValueError( + "You need to define a `optim_target_modules` in order to properly use APOLLO optimizers" + ) + + if not isinstance(args.optim_target_modules, (list, str)): + raise ValueError( + f"`optim_target_modules` has to be a list of strings, a string corresponding to a regex, or a specific module or 'all-linear', you passed {args.optim_target_modules}" + ) + + if model is None: + raise ValueError("You need to pass a model in order to correctly initialize a APOLLO optimizer.") + + all_linear = ( + isinstance(args.optim_target_modules, str) + and args.optim_target_modules.replace("_", "-") == "all-linear" + ) + + apollo_params = [] + apollo_params_names = [] + for module_name, module in model.named_modules(): + target_module_exists, is_regex = check_target_module_exists( + args.optim_target_modules, module_name, return_is_regex=True + ) + + if not isinstance(module, nn.Linear): + # Warn in case we match but it's not a linear layer + if target_module_exists and not is_regex: + logger.warning( + f"{module_name} has been matched but ignored as APOLLO only supports linear layers. Please double check your `optim_target_modules`!" 
+ ) + + continue + + if not target_module_exists and not all_linear: + continue + + apollo_params.append(module.weight) + apollo_params_names.append(module_name + ".weight") + + if len(apollo_params) == 0: + raise ValueError( + f"None of the target modules were found! ({args.optim_target_modules}). Please make sure to pass a valid `target_modules`." + ) + + non_apollo_params = [p for n, p in model.named_parameters() if n not in apollo_params_names] + apollo_optim_kwargs = { + "rank": int(optim_args.pop("rank", 128)), + "proj": optim_args.pop("proj", "random"), + "scale_type": optim_args.pop("scale_type", "channel"), + "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)), + "scale": float(optim_args.pop("scale", 1.0)), + "proj_type": optim_args.pop("proj_type", "std"), + } + + # The default args are from the official repository: https://github.com/zhuhanqing/APOLLO + param_groups = [ + {"params": non_apollo_params}, + {"params": apollo_params, **apollo_optim_kwargs}, + ] + + if is_layerwise: + # For layer-wise optimizers, the optimization step is done through post accumulation + # gradient hooks. The trick is to first attach these hooks to the model parameters then + # create a dummy optimizer that will perform no-ops in the Trainer. + # See the original implementation or the nice implementation from @hiyouga + # here: https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba + if args.gradient_accumulation_steps != 1: + raise ValueError("Layerwise APOLLO optimizer do not support gradient accumulation !") + + optimizer_dict = {} + for param in non_apollo_params: + param_groups = [{"params": [param]}] + optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) + for param in apollo_params: + param_groups = [{"params": [param], **apollo_optim_kwargs}] + optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) + + def optimizer_hook(param): + if param.grad is not None: + optimizer_dict[param].step() + optimizer_dict[param].zero_grad() + + for param in model.parameters(): + if param.requires_grad: + param.register_post_accumulate_grad_hook(optimizer_hook) + + optimizer_cls = LayerWiseDummyOptimizer + optimizer_kwargs.update({"optimizer_dict": optimizer_dict}) + + optimizer_kwargs.update({"params": param_groups}) elif args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: if not is_lomo_available(): raise ImportError( diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 5bc31b616003..123b5d5e8863 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -184,6 +184,8 @@ class OptimizerNames(ExplicitEnum): GROKADAMW = "grokadamw" SCHEDULE_FREE_ADAMW = "schedule_free_adamw" SCHEDULE_FREE_SGD = "schedule_free_sgd" + APOLLO_ADAMW = "apollo_adamw" + APOLLO_ADAMW_LAYERWISE = "apollo_adamw_layerwise" # Sometimes users will pass in a `str` repr of a dict in the CLI diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index e5aedf5916fa..4b226ef4dc24 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -118,6 +118,7 @@ get_torch_version, is_accelerate_available, is_apex_available, + is_apollo_torch_available, is_aqlm_available, is_auto_awq_available, is_auto_gptq_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index ac07281b3d33..9c7f710482ab 100755 --- 
a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -98,6 +98,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) _apex_available = _is_package_available("apex") +_apollo_torch_available = _is_package_available("apollo_torch") _aqlm_available = _is_package_available("aqlm") _vptq_available, _vptq_version = _is_package_available("vptq", return_version=True) _av_available = importlib.util.find_spec("av") is not None @@ -402,6 +403,10 @@ def is_galore_torch_available(): return _galore_torch_available +def is_apollo_torch_available(): + return _apollo_torch_available + + def is_lomo_available(): return _lomo_available diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 3de94511fb8e..b2872fc83af8 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -66,6 +66,7 @@ get_tests_dir, is_staging_test, require_accelerate, + require_apollo_torch, require_bitsandbytes, require_deepspeed, require_galore_torch, @@ -2235,6 +2236,168 @@ def test_galore_lr_display_with_scheduler(self): # warm up steps << total steps self.assertTrue(len(decreasing_lrs) > len(increasing_lrs)) + @require_apollo_torch + @require_torch_gpu + def test_apollo(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + learning_rate=1e-9, + logging_steps=5, + optim="apollo_adamw", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + + @require_apollo_torch + @require_torch_gpu + def test_apollo_extra_args(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + learning_rate=1e-9, + logging_steps=5, + optim="apollo_adamw", + optim_args="proj=random,scale_type=tensor,rank=1,update_proj_gap=100,scale=128.0", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + + @require_apollo_torch + @require_torch_gpu + def test_apollo_layerwise(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + learning_rate=1e-9, + logging_steps=5, + optim="apollo_adamw_layerwise", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + + @require_apollo_torch + @require_torch_gpu + def test_apollo_layerwise_with_scheduler(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = 
RepeatDataset(x) + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + learning_rate=1e-9, + logging_steps=5, + optim="apollo_adamw_layerwise", + lr_scheduler_type="cosine", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + + @require_apollo_torch + @require_torch_gpu + def test_apollo_lr_display_without_scheduler(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + learning_rate = 1e-9 + num_steps = 10 + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + learning_rate=learning_rate, + logging_steps=5, + optim="apollo_adamw", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + + # reflects displayed lr in trainer + self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate]) + + @require_apollo_torch + @require_torch_gpu + def test_apollo_lr_display_with_scheduler(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + learning_rate = 2e-4 + num_train_epochs = 10 + num_warmup_steps = 5 + + # Trainer without inf/nan filter + args = TrainingArguments( + self.get_auto_remove_tmp_dir(), + num_train_epochs=num_train_epochs, + learning_rate=learning_rate, + warmup_steps=num_warmup_steps, + lr_scheduler_type="cosine", + logging_steps=1, + optim="apollo_adamw", + optim_target_modules=[r".*attn.*", r".*mlp.*"], + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # creating log history of trainer, results don't matter + trainer.train() + logs = trainer.state.log_history[1:][:-1] + + # reach given learning rate peak and end with 0 lr + self.assertTrue(logs[num_warmup_steps - 2]["learning_rate"] == learning_rate) + self.assertTrue(logs[-1]["learning_rate"] == 0) + + # increasing and decreasing pattern of lrs + increasing_lrs = [ + logs[i]["learning_rate"] < logs[i + 1]["learning_rate"] + for i in range(len(logs)) + if i < num_warmup_steps - 2 + ] + decreasing_lrs = [ + logs[i]["learning_rate"] > logs[i + 1]["learning_rate"] + for i in range(len(logs) - 1) + if i >= num_warmup_steps - 2 + ] + + self.assertTrue(all(increasing_lrs)) + self.assertTrue(all(decreasing_lrs)) + + # warm up steps << total steps + self.assertTrue(len(decreasing_lrs) > len(increasing_lrs)) + @require_torch_multi_accelerator def test_data_is_not_parallelized_when_model_is_parallel(self): model = RegressionModel() From 9fc525b861b7be6966475290dc2f6551b5d072d8 Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 6 Feb 2025 21:22:35 -0600 Subject: [PATCH 2/6] fix comment --- docs/source/en/trainer.md | 6 ++---- src/transformers/training_args.py | 9 ++++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 4ff854dbc056..e30ab19e1ff6 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -545,7 +545,7 @@ import torch import datasets import trl -from transformers import TrainingArguments, AutoConfig, AutoTokenizer, 
AutoModelForCausalLM +from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM train_dataset = datasets.load_dataset('imdb', split='train') @@ -559,10 +559,8 @@ args = TrainingArguments( model_id = "google/gemma-2b" -config = AutoConfig.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_config(config).to(0) +model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) trainer = trl.SFTTrainer( model=model, diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 123b5d5e8863..36c2224b210e 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -791,11 +791,10 @@ class TrainingArguments: [original code](https://github.com/neelsjain/NEFTune). Support transformers `PreTrainedModel` and also `PeftModel` from peft. The original paper used values in the range [5.0, 15.0]. optim_target_modules (`Union[str, List[str]]`, *optional*): - The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm - https://arxiv.org/abs/2403.03507 - See: https://github.com/jiaweizzhao/GaLore for more details. You need to make sure to pass a valid GaloRe - optimizer, e.g. one of: "galore_adamw", "galore_adamw_8bit", "galore_adafactor" and make sure that the target modules are `nn.Linear` modules - only. + The target modules to optimize, i.e. the module names that you would like to train. + Currently used for the GaLore algorithm (https://arxiv.org/abs/2403.03507) and APOLLO algorithm (https://arxiv.org/abs/2412.05270). + See GaLore implementation (https://github.com/jiaweizzhao/GaLore) and APOLLO implementation (https://github.com/zhuhanqing/APOLLO) for more details. + You need to make sure to pass a valid GaLore or APOLLO optimizer, e.g., one of: "apollo_adamw", "galore_adamw", "galore_adamw_8bit", "galore_adafactor" and make sure that the target modules are `nn.Linear` modules only. batch_eval_metrics (`Optional[bool]`, defaults to `False`): If set to `True`, evaluation will call compute_metrics at the end of each batch to accumulate statistics From a884215004e21a3ddb6fe546ddf8d43d544267e6 Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 6 Feb 2025 22:44:30 -0600 Subject: [PATCH 3/6] Remove redundancy: Modularize low-rank optimizer construction --- src/transformers/trainer.py | 281 +++++++++++++----------------------- 1 file changed, 99 insertions(+), 182 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index cbb1573a16ef..0f8f20576606 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1311,6 +1311,99 @@ def get_optimizer_cls_and_kwargs( "betas": (args.adam_beta1, args.adam_beta2), "eps": args.adam_epsilon, } + + # Helper function for GaLore and Apollo optimizers + def setup_low_rank_optimizer(optimizer_name, optimizer_mapping, optim_kwargs, is_layerwise_supported=True): + """ + Helper function to set up low-rank optimizers like GaLore and Apollo. + + Args: + optimizer_name (str): Name of the optimizer. + optimizer_mapping (dict): Mapping of optimizer names to their classes. + optim_kwargs (dict): Keyword arguments for the optimizer. + is_layerwise_supported (bool): Whether layerwise optimization is supported. + + Returns: + Tuple[Any, Any]: Optimizer class and updated optimizer kwargs. 
+ """ + is_layerwise = optimizer_name.lower().endswith("layerwise") + if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED and is_layerwise_supported: + raise NotImplementedError(f"Layer-wise {optimizer_name} does not support DDP at this time") + + optimizer_cls = optimizer_mapping[optimizer_name] + + if args.optim_target_modules is None: + raise ValueError(f"You need to define `optim_target_modules` to use {optimizer_name} optimizers") + + if not isinstance(args.optim_target_modules, (list, str)): + raise ValueError( + f"`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: {args.optim_target_modules}" + ) + + if model is None: + raise ValueError(f"You need to pass a model to initialize {optimizer_name} optimizer.") + + all_linear = ( + isinstance(args.optim_target_modules, str) + and args.optim_target_modules.replace("_", "-") == "all-linear" + ) + + target_params = [] + target_params_names = [] + for module_name, module in model.named_modules(): + target_module_exists, is_regex = check_target_module_exists( + args.optim_target_modules, module_name, return_is_regex=True + ) + + if not isinstance(module, nn.Linear): + if target_module_exists and not is_regex: + logger.warning( + f"{module_name} matched but ignored. {optimizer_name} only supports linear layers." + ) + continue + + if not target_module_exists and not all_linear: + continue + + target_params.append(module.weight) + target_params_names.append(module_name + ".weight") + + if len(target_params) == 0: + raise ValueError(f"No target modules found for {optimizer_name} ({args.optim_target_modules}).") + + non_target_params = [p for n, p in model.named_parameters() if n not in target_params_names] + optim_kwargs.update(optim_args) + + param_groups = [ + {"params": non_target_params}, + {"params": target_params, **optim_kwargs}, + ] + + if is_layerwise: + if args.gradient_accumulation_steps != 1: + raise ValueError(f"Layerwise {optimizer_name} does not support gradient accumulation!") + + optimizer_dict = {} + for param in non_target_params: + optimizer_dict[param] = optimizer_cls([{"params": [param]}], **optimizer_kwargs) + for param in target_params: + optimizer_dict[param] = optimizer_cls([{"params": [param], **optim_kwargs}], **optimizer_kwargs) + + def optimizer_hook(param): + if param.grad is not None: + optimizer_dict[param].step() + optimizer_dict[param].zero_grad() + + for param in model.parameters(): + if param.requires_grad: + param.register_post_accumulate_grad_hook(optimizer_hook) + + optimizer_cls = LayerWiseDummyOptimizer + optimizer_kwargs.update({"optimizer_dict": optimizer_dict}) + + optimizer_kwargs.update({"params": param_groups}) + return optimizer_cls, optimizer_kwargs + if args.optim == OptimizerNames.ADAFACTOR: optimizer_cls = Adafactor optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) @@ -1472,10 +1565,6 @@ def get_optimizer_cls_and_kwargs( ) from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit - is_layerwise = args.optim.lower().endswith("layerwise") - if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED: - raise NotImplementedError("Layer-wise GaLore does not support DDP at this time") - optimizer_mapping = { OptimizerNames.GALORE_ADAMW: GaLoreAdamW, OptimizerNames.GALORE_ADAMW_8BIT: GaLoreAdamW8bit, @@ -1485,59 +1574,6 @@ def get_optimizer_cls_and_kwargs( OptimizerNames.GALORE_ADAFACTOR_LAYERWISE: GaLoreAdafactor, } - optimizer_cls = optimizer_mapping[args.optim] - - if args.optim_target_modules is 
None: - raise ValueError( - "You need to define a `optim_target_modules` in order to properly use GaLore optimizers" - ) - - if not isinstance(args.optim_target_modules, (list, str)): - raise ValueError( - f"`optim_target_modules` has to be a list of strings, a string corresponding to a regex, or a specific module or 'all-linear', you passed {args.optim_target_modules}" - ) - - if model is None: - raise ValueError("You need to pass a model in order to correctly initialize a GaLore optimizer.") - - logger.warning( - "Activated GaLoRE fine-tuning, depending on your model size and hardware, the training might take a while before starting. Please be patient !" - ) - - all_linear = ( - isinstance(args.optim_target_modules, str) - and args.optim_target_modules.replace("_", "-") == "all-linear" - ) - - galore_params = [] - galore_params_names = [] - for module_name, module in model.named_modules(): - target_module_exists, is_regex = check_target_module_exists( - args.optim_target_modules, module_name, return_is_regex=True - ) - - if not isinstance(module, nn.Linear): - # Warn in case we match but it's not a linear layer - if target_module_exists and not is_regex: - logger.warning( - f"{module_name} has been matched but ignored as GaLore only supports linear layers. Please double check your `optim_target_modules`!" - ) - - continue - - if not target_module_exists and not all_linear: - continue - - galore_params.append(module.weight) - galore_params_names.append(module_name + ".weight") - - if len(galore_params) == 0: - raise ValueError( - f"None of the target modules were found! ({args.optim_target_modules}). Please make sure to pass a valid `target_modules`." - ) - - non_galore_params = [p for n, p in model.named_parameters() if n not in galore_params_names] - galore_optim_kwargs = { "rank": int(optim_args.pop("rank", 128)), "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)), @@ -1545,43 +1581,9 @@ def get_optimizer_cls_and_kwargs( "proj_type": optim_args.pop("proj_type", "std"), } - # The default args are from the official repository: https://github.com/jiaweizzhao/GaLore - param_groups = [ - {"params": non_galore_params}, - {"params": galore_params, **galore_optim_kwargs}, - ] - - if is_layerwise: - # For layer-wise optimizers, the optimization step is done through post accumulation - # gradient hooks. The trick is to first attach these hooks to the model parameters then - # create a dummy optimizer that will perform no-ops in the Trainer. 
- # See the original implementation or the nice implementation from @hiyouga - # here: https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba - if args.gradient_accumulation_steps != 1: - raise ValueError("Layerwise GaLoRE optimizer do not support gradient accumulation !") - - optimizer_dict = {} - for param in non_galore_params: - param_groups = [{"params": [param]}] - optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) - for param in galore_params: - param_groups = [{"params": [param], **galore_optim_kwargs}] - optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) - - def optimizer_hook(param): - if param.grad is not None: - optimizer_dict[param].step() - optimizer_dict[param].zero_grad() - - for param in model.parameters(): - if param.requires_grad: - param.register_post_accumulate_grad_hook(optimizer_hook) - - optimizer_cls = LayerWiseDummyOptimizer - optimizer_kwargs.update({"optimizer_dict": optimizer_dict}) - - optimizer_kwargs.update({"params": param_groups}) - + optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer( + args.optim, optimizer_mapping, galore_optim_kwargs + ) if args.optim == OptimizerNames.GALORE_ADAFACTOR: optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) elif args.optim in [ @@ -1595,63 +1597,11 @@ def optimizer_hook(param): ) from apollo_torch import APOLLOAdamW - is_layerwise = args.optim.lower().endswith("layerwise") - if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED: - raise NotImplementedError("Layer-wise APOLLO does not support DDP at this time") - optimizer_mapping = { OptimizerNames.APOLLO_ADAMW: APOLLOAdamW, OptimizerNames.APOLLO_ADAMW_LAYERWISE: APOLLOAdamW, } - optimizer_cls = optimizer_mapping[args.optim] - - if args.optim_target_modules is None: - raise ValueError( - "You need to define a `optim_target_modules` in order to properly use APOLLO optimizers" - ) - - if not isinstance(args.optim_target_modules, (list, str)): - raise ValueError( - f"`optim_target_modules` has to be a list of strings, a string corresponding to a regex, or a specific module or 'all-linear', you passed {args.optim_target_modules}" - ) - - if model is None: - raise ValueError("You need to pass a model in order to correctly initialize a APOLLO optimizer.") - - all_linear = ( - isinstance(args.optim_target_modules, str) - and args.optim_target_modules.replace("_", "-") == "all-linear" - ) - - apollo_params = [] - apollo_params_names = [] - for module_name, module in model.named_modules(): - target_module_exists, is_regex = check_target_module_exists( - args.optim_target_modules, module_name, return_is_regex=True - ) - - if not isinstance(module, nn.Linear): - # Warn in case we match but it's not a linear layer - if target_module_exists and not is_regex: - logger.warning( - f"{module_name} has been matched but ignored as APOLLO only supports linear layers. Please double check your `optim_target_modules`!" - ) - - continue - - if not target_module_exists and not all_linear: - continue - - apollo_params.append(module.weight) - apollo_params_names.append(module_name + ".weight") - - if len(apollo_params) == 0: - raise ValueError( - f"None of the target modules were found! ({args.optim_target_modules}). Please make sure to pass a valid `target_modules`." 
- ) - - non_apollo_params = [p for n, p in model.named_parameters() if n not in apollo_params_names] apollo_optim_kwargs = { "rank": int(optim_args.pop("rank", 128)), "proj": optim_args.pop("proj", "random"), @@ -1661,42 +1611,9 @@ def optimizer_hook(param): "proj_type": optim_args.pop("proj_type", "std"), } - # The default args are from the official repository: https://github.com/zhuhanqing/APOLLO - param_groups = [ - {"params": non_apollo_params}, - {"params": apollo_params, **apollo_optim_kwargs}, - ] - - if is_layerwise: - # For layer-wise optimizers, the optimization step is done through post accumulation - # gradient hooks. The trick is to first attach these hooks to the model parameters then - # create a dummy optimizer that will perform no-ops in the Trainer. - # See the original implementation or the nice implementation from @hiyouga - # here: https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba - if args.gradient_accumulation_steps != 1: - raise ValueError("Layerwise APOLLO optimizer do not support gradient accumulation !") - - optimizer_dict = {} - for param in non_apollo_params: - param_groups = [{"params": [param]}] - optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) - for param in apollo_params: - param_groups = [{"params": [param], **apollo_optim_kwargs}] - optimizer_dict[param] = optimizer_cls(param_groups, **optimizer_kwargs) - - def optimizer_hook(param): - if param.grad is not None: - optimizer_dict[param].step() - optimizer_dict[param].zero_grad() - - for param in model.parameters(): - if param.requires_grad: - param.register_post_accumulate_grad_hook(optimizer_hook) - - optimizer_cls = LayerWiseDummyOptimizer - optimizer_kwargs.update({"optimizer_dict": optimizer_dict}) - - optimizer_kwargs.update({"params": param_groups}) + optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer( + args.optim, optimizer_mapping, apollo_optim_kwargs + ) elif args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: if not is_lomo_available(): raise ImportError( From 14c8cd917bc8e4cc5412997efedf27ebe67fc6a0 Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 6 Feb 2025 22:49:42 -0600 Subject: [PATCH 4/6] Remove redundancy: Remove useless comment --- src/transformers/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 0f8f20576606..4db77681995d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1312,7 +1312,6 @@ def get_optimizer_cls_and_kwargs( "eps": args.adam_epsilon, } - # Helper function for GaLore and Apollo optimizers def setup_low_rank_optimizer(optimizer_name, optimizer_mapping, optim_kwargs, is_layerwise_supported=True): """ Helper function to set up low-rank optimizers like GaLore and Apollo. 
From 935e507abfd1f21befbcd6b901560ece75970fbb Mon Sep 17 00:00:00 2001 From: hanqing Date: Sat, 8 Feb 2025 00:37:27 -0600 Subject: [PATCH 5/6] Fix comment: Add typing --- src/transformers/trainer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index dbabfd6bab0d..627d3d79cd34 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1312,7 +1312,12 @@ def get_optimizer_cls_and_kwargs( "eps": args.adam_epsilon, } - def setup_low_rank_optimizer(optimizer_name, optimizer_mapping, optim_kwargs, is_layerwise_supported=True): + def setup_low_rank_optimizer( + optimizer_name: str, + optimizer_mapping: Dict[str, Any], + optim_kwargs: Dict[str, Any], + is_layerwise_supported: bool = True, + ) -> Tuple[Any, Any]: """ Helper function to set up low-rank optimizers like GaLore and Apollo. From 966bf9b1c87cbdccd30de40133ff84259236d50c Mon Sep 17 00:00:00 2001 From: hanqing Date: Sat, 8 Feb 2025 01:25:35 -0600 Subject: [PATCH 6/6] Fix comment: Rewrite apollo desc --- docs/source/en/trainer.md | 106 ++++++++++++-------------------------- 1 file changed, 33 insertions(+), 73 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index e30ab19e1ff6..96897948b1e8 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -443,28 +443,36 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. - ### APOLLO - Approximated Gradient Scaling for Memory Efficient LLM Optimization (APOLLO) is a memory-efficient low-rank training strategy that allows full-parameter learning for both pre-training and fine-tuning, while maintaining AdamW-level performance with SGD-like memory efficiency. +Approximated Gradient Scaling for Memory Efficient LLM Optimization (APOLLO) is a memory-efficient training strategy that allows full-parameter learning for both pre-training and fine-tuning, while maintaining AdamW-level performance with SGD-like memory efficiency. + +* **Ultra-low rank efficiency** → Requires much lower rank than GaLore—even rank 1 (APOLLO-Mini) suffices. +* **No expensive SVD computations** → Unlike GaLore, APOLLO leverages random projection, avoiding training stalls. - * **Ultra-low rank efficiency** → Requires much lower rank than GaLore—even rank 1 (APOLLO-Mini) suffices. - * **No expensive SVD computations** → Unlike GaLore, APOLLO leverages random projection, avoiding training stalls. +You can read more about the method in the [original repository](https://github.com/zhuhanqing/APOLLO) or the [APOLLO: SGD-like Memory, AdamW-level Performance](https://arxiv.org/abs/2412.05270). -First make sure to install APOLLO from its official repository: +First, make sure to install APOLLO from its official repository: ```bash pip install apollo-torch ``` -Then simply add one of `["apollo_adamw"]` in `optim` together with `optim_target_modules`, which can be a list of strings, regex or full path corresponding to the target module names you want to adapt. 
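For reference, a minimal configuration sketch for the layer-wise variant (taken from the earlier example in this series; as with layer-wise GaLore, it requires a single GPU, no DDP, and `gradient_accumulation_steps=1`):

```python
from transformers import TrainingArguments

# Layer-wise APOLLO: identical to the plain "apollo_adamw" setup, only the optimizer name changes.
# Weight updates are applied per layer via post-accumulation gradient hooks.
args = TrainingArguments(
    output_dir="./test-apollo",
    max_steps=100,
    per_device_train_batch_size=2,
    optim="apollo_adamw_layerwise",
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
)
```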
Below is an end-to-end example script (make sure to `pip install trl datasets`): +Then, APOLLO optimizers can be used simply by setting `optim="apollo_adamw"` and specifying `optim_target_modules`. +`optim_target_modules` can be a list of strings, regex or full path corresponding to the target module names you want to adapt. +Currently, only Linear layers are considered to use the APOLLO optimizers, i.e., included in `optim_target_modules,` while the remaining models are still using AdamW. + + +You can also enable layer-wise APOLLO by appending "layerwise" to the optimizer name (optim="apollo_adamw_layerwise"), the same as layer-wise GaLore. This saves additional memory for gradient by performing weight updates layer by layer. + +Below is an end-to-end example script (make sure to `pip install trl datasets`): ```python import torch import datasets import trl -from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM +from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM train_dataset = datasets.load_dataset('imdb', split='train') @@ -478,10 +486,8 @@ args = TrainingArguments( model_id = "google/gemma-2b" -config = AutoConfig.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_config(config).to(0) +model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) trainer = trl.SFTTrainer( model=model, @@ -494,86 +500,40 @@ trainer = trl.SFTTrainer( trainer.train() ``` -To pass extra arguments supported by APOLLO, you should pass correctly `optim_args`, for example: -```python -import torch -import datasets -import trl +You can further customize APOLLO’s behavior by passing hyperparameters using `optim_args`. -from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM +| Parameter | Description | +|------------------|-------------| +| `rank` | Rank of the auxiliary sub-space used for gradient scaling.
**APOLLO (default=256)** → Works well for 1B and 7B models. <br> **APOLLO-Mini (default=1)** |
+| `scale_type` | How scaling factors are applied. <br> **`channel`** → Per-channel scaling (used in APOLLO). <br> **`tensor`** → Per-tensor scaling (used in APOLLO-Mini). |
+| `scale` | Adjusts gradient updates to stabilize training. <br> **APOLLO (default=1.0)** <br>
**APOLLO-Mini (default=128)** | +| `update_proj_gap` | Steps before updating projection matrices. Default: **200**. | +| `proj` | Type of projection. Default: **`random`**. | -train_dataset = datasets.load_dataset('imdb', split='train') -args = TrainingArguments( - output_dir="./test-galore", - max_steps=100, - per_device_train_batch_size=2, - optim="galore_adamw", - optim_target_modules=[r".*.attn.*", r".*.mlp.*"], - optim_args="proj=random,scale_type=tensor,rank=128,update_proj_gap=100,scale=1.0", - -) - -model_id = "google/gemma-2b" - -config = AutoConfig.from_pretrained(model_id) - -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_config(config).to(0) - -trainer = trl.SFTTrainer( - model=model, - args=args, - train_dataset=train_dataset, - dataset_text_field='text', - max_seq_length=512, -) - -trainer.train() -``` - -Currently only Linear layers are considered to use the APOLLO optimizers, while the remaining modueles are still using AdamW. + -You can read more about the method in the [original repository](https://github.com/zhuhanqing/APOLLO) or the [paper](https://arxiv.org/abs/2412.05270). +The `scale` parameter can be set to `n/r`, where `n` is the original space dimension and `r` is the low-rank space dimension. +Alternatively, you can achieve a similar effect by adjusting the learning rate, while keeping scale at its default value. + -You can also perform layer-wise APOLLO by simply post-pending the optimizer name with `layerwise` like below: +For example, you can enable APOLLO-Mini (rank=1 for extreme memory efficiency) by passing `optim_args`: ```python -import torch -import datasets -import trl - -from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM - -train_dataset = datasets.load_dataset('imdb', split='train') args = TrainingArguments( - output_dir="./test-apollo", + output_dir="./test-galore", max_steps=100, per_device_train_batch_size=2, - optim="apollo_adamw_layerwise", - optim_target_modules=[r".*.attn.*", r".*.mlp.*"] -) - -model_id = "google/gemma-2b" - -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) + optim="apollo_adamw", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"], + optim_args="proj=random,rank=1,scale=128.0,scale_type=tensor,update_proj_gap=200", -trainer = trl.SFTTrainer( - model=model, - args=args, - train_dataset=train_dataset, - dataset_text_field='text', - max_seq_length=512, ) - -trainer.train() ``` - ### LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195).