nerdy-tech-com-gitub
diff --git a/‎scripts/check_copyright_header.py renamed to ‎.github/scripts/check_copyright_header.py b/‎scripts/check_copyright_header.py renamed to ‎.github/scripts/check_copyright_header.py
diff --git a/‎scripts/markdown_link_check_config.json renamed to ‎.github/scripts/markdown_link_check_config.json b/‎scripts/markdown_link_check_config.json renamed to ‎.github/scripts/markdown_link_check_config.json
diff --git a/‎scripts/spellcheck.sh renamed to ‎.github/scripts/spellcheck.sh
Lines changed: 1 addition & 1 deletion b/‎scripts/spellcheck.sh renamed to ‎.github/scripts/spellcheck.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/spellcheck_conf/spellcheck.yaml renamed to ‎.github/scripts/spellcheck_conf/spellcheck.yaml
Lines changed: 2 additions & 2 deletions b/‎scripts/spellcheck_conf/spellcheck.yaml renamed to ‎.github/scripts/spellcheck_conf/spellcheck.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/‎scripts/spellcheck_conf/wordlist.txt renamed to ‎.github/scripts/spellcheck_conf/wordlist.txt
Lines changed: 41 additions & 0 deletions b/‎scripts/spellcheck_conf/wordlist.txt renamed to ‎.github/scripts/spellcheck_conf/wordlist.txt
Lines changed: 41 additions & 0 deletions
diff --git a/‎.github/workflows/spellcheck.yml
Lines changed: 5 additions & 5 deletions b/‎.github/workflows/spellcheck.yml
Lines changed: 5 additions & 5 deletions
diff --git a/‎CONTRIBUTING.md
Lines changed: 4 additions & 4 deletions b/‎CONTRIBUTING.md
Lines changed: 4 additions & 4 deletions
diff --git a/‎README.md
Lines changed: 4 additions & 0 deletions b/‎README.md
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/LLM_finetuning.md
Lines changed: 4 additions & 4 deletions b/‎docs/LLM_finetuning.md
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/multi_gpu.md
Lines changed: 58 additions & 36 deletions b/‎docs/multi_gpu.md
Lines changed: 58 additions & 36 deletions
@@ -19,5 +19,5 @@ done
 if [ ! "$sources_arg" ]; then
 	echo "No files to spellcheck"
 else
-	pyspelling -c scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
+	pyspelling -c .github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
 fi
@@ -5,8 +5,8 @@ matrix:
     d: en_US
   dictionary:
     wordlists:
-    - scripts/spellcheck_conf/wordlist.txt
-    output: scripts/spellcheck_conf/wordlist.dic
+    - .github/scripts/spellcheck_conf/wordlist.txt
+    output: .github/scripts/spellcheck_conf/wordlist.dic
     encoding: utf-8
   pipeline:
   - pyspelling.filters.context:
 
@@ -1310,3 +1310,44 @@ leaderboards
 txn
 ollama
 tavily
+AgentExecutor
+LangGraph
+langgraph
+vectorstore
+CMake
+Chipset
+JBR
+JNI
+MLCChat
+MTP
+MacBook
+Moreau
+NDK
+NDK's
+OSX
+OnePlus
+OxygenOS
+SoC
+Sonoma
+TVM
+Thierry
+Wifi
+chipset
+feb
+moreau
+octo
+rustc
+rustup
+sha
+tmoreau
+toolchain
+wifi
+AgentFinish
+ReAct
+customizable
+Kaggle
+SalesBot
+Weaviate
+MediaGen
+SDXL
+SVD
@@ -20,11 +20,11 @@ jobs:
         uses: gaurav-nelson/[email protected]
         with:
           use-verbose-mode: 'yes'
-          config-file: "scripts/markdown_link_check_config.json"
+          config-file: ".github/scripts/markdown_link_check_config.json"
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
 
           files: |
@@ -42,7 +42,7 @@ jobs:
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
           files: |
             **/*.md
@@ -56,11 +56,11 @@ jobs:
           if [ ! "$sources" ]; then
             echo "No files to spellcheck"
           else
-            pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
+            pyspelling -c $GITHUB_WORKSPACE/.github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
           fi
 
       - name: In the case of misspellings
         if: ${{ failure() }}
         run: |
           echo "Please fix the misspellings. If you are sure about some of them, "
-          echo "so append those to scripts/spellcheck_conf/wordlist.txt"
+          echo "so append those to .github/scripts/spellcheck_conf/wordlist.txt"
@@ -43,17 +43,17 @@ For development and contributing to llama-recipes please install from source wit
 pip install -U pip setuptools
 pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e .[tests,auditnlg,vllm]
 ```
-The unit tests can be found in the [tests](./tests/) folder and you can run them from the main directory using:
+The unit tests can be found in the [src/tests](./src/tests/) folder and you can run them from the main directory using:
 ```
-python -m pytest tests/
+python -m pytest src/tests/
 ```
 To run all tests of a single file you can give the filename directly:
 ```
-python -m pytest tests/test_finetuning.py
+python -m pytest src/tests/test_finetuning.py
 ```
 To run a specific test you can filter for its name with
 ```
-python -m pytest tests/test_finetuning.py -k test_finetuning_peft
+python -m pytest src/tests/test_finetuning.py -k test_finetuning_peft
 ```
 To add a new test simply create a new test file under the src/tests folder (filename has to start with `test_`).
 Group tests spanning the same feature in the same file and create a subfolder if the tests are very extensive.
@@ -64,6 +64,10 @@ If you want to use PyTorch nightlies instead of the stable release, go to [this
 ### Installing
 Llama-recipes provides a pip distribution for easy install and usage in other projects. Alternatively, it can be installed from source.
 
+> [!NOTE]
+> Ensure you use the correct CUDA version (from `nvidia-smi`) when installing the PyTorch wheels. Here we are using 11.8 as `cu118`.
+> H100 GPUs work better with CUDA >12.0
+
 #### Install with pip
 ```
 pip install llama-recipes
 
@@ -1,10 +1,10 @@
 ## LLM Fine-Tuning
 
-Here we discuss fine-tuning Llama 2 with a couple of different recipes. We will cover two scenarios here:
+Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
 
 
 ## 1. **Parameter Efficient Model Fine-Tuning**
- This helps make the fine-tuning process more affordable even on 1 consumer grade GPU. These methods enable us to keep the whole model frozen and to just add tiny learnable parameters/ layers into the model. In this way, we just train a very tiny portion of the parameters. The most famous method in this category is [LORA](https://arxiv.org/pdf/2106.09685.pdf), LLaMA Adapter and Prefix-tuning.
+ This helps make the fine-tuning process more affordable even on 1 consumer grade GPU. These methods enable us to keep the whole model frozen and to just add tiny learnable parameters/ layers into the model. In this way, we just train a very tiny portion of the parameters. The most famous method in this category is [LORA](https://arxiv.org/pdf/2106.09685.pdf), Llama Adapter and Prefix-tuning.
 
 
 These methods will address three aspects:
@@ -14,7 +14,7 @@ These methods will address three aspects:
 
 - **Cost of deployment** – for each fine-tuned downstream model we need to deploy a separate model; however, when using these methods, only a small set of parameters (few MB instead of several GBs) of the pretrained model can do the job. In this case, for each task we only add these extra parameters on top of the pretrained model so pretrained models can be assumed as backbone and these parameters as heads for the model on different tasks.
 
-- **Catastrophic forgetting** — these methods also help with forgetting the first task that can happen in fine-tunings.
+- **Catastrophic forgetting** — these methods also help mitigate forgetting of the first task, which can happen during fine-tuning.
 
 HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
 
@@ -42,7 +42,7 @@ You can also keep most of the layers frozen and only fine-tune a few layers. The
 
 
 
-In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Llama 2 7B parameter won't fit into one gpu.
+In this scenario, depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case, the Meta Llama 3 8B parameter model won't fit into one GPU.
 The way you want to think about it is, you would need enough GPU memory to keep model parameters, gradients and optimizer states, where each of these, depending on the precision you are training in, can take up multiple times your parameter count x precision (depending on whether it's fp32/4 bytes, fp16/2 bytes, or bf16/2 bytes).
 For example AdamW optimizer keeps 2 parameters for each of your parameters and in many cases these are kept in fp32. This implies that depending on how many layers you are training/ unfreezing your GPU memory can grow beyond one GPU.
 
 
@@ -6,9 +6,9 @@ To run fine-tuning on multi-GPUs, we will  make use of two packages:
 
 2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](LLM_finetuning.md/#2-full-partial-parameter-finetuning).
 
-Given the combination of PEFT and FSDP, we would be able to fine tune a Llama 2 model on multiple GPUs in one node or multi-node.
+Given the combination of PEFT and FSDP, we would be able to fine tune a Meta Llama 3 8B model on multiple GPUs in one node or multi-node.
 
-## Requirements 
+## Requirements
 To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
 
 **Please note that the llama_recipes package will install PyTorch 2.0.1 version, in case you want to run FSDP + PEFT, please make sure to install PyTorch nightlies.**
@@ -24,7 +24,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -34,7 +34,7 @@ The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 
-* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
+* `--peft_method` to specify the PEFT method; here we use `lora`, the other option is `llama_adapter`.
 
 We use `torchrun` here to spawn multiple processes for FSDP.
 
@@ -43,7 +43,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up the fine-tuning job. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
 
 ### Fine-tuning using FSDP Only
@@ -52,7 +52,7 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 
 ```
 
@@ -62,7 +62,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -95,16 +95,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammar_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -115,32 +115,48 @@ torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --m
 It lets us specify the training settings for everything from `model_name` to `dataset_name`, `batch_size` and so on. Below is the list of supported settings:
 
 ```python
-
-model_name: str="PATH/to/LLAMA 2/7B"
-enable_fsdp: bool= False
-run_validation: bool=True
-batch_size_training: int=4
-gradient_accumulation_steps: int=1
-num_epochs: int=3
-num_workers_dataloader: int=2
-lr: float=2e-4
-weight_decay: float=0.0
-gamma: float= 0.85
-use_fp16: bool=False
-mixed_precision: bool=True
-val_batch_size: int=4
-dataset = "samsum_dataset" # alpaca_dataset, grammar_dataset
-peft_method: str = "lora" # None , llama_adapter, prefix
-use_peft: bool=False
-output_dir: str = "./ft-output"
-freeze_layers: bool = False
-num_freeze_layers: int = 1
-quantization: bool = False
-save_model: bool = False
-dist_checkpoint_root_folder: str="model_checkpoints"
-dist_checkpoint_folder: str="fine-tuned"
-save_optimizer: bool=False
-
+    model_name: str="PATH/to/Model"
+    tokenizer_name: str=None
+    enable_fsdp: bool=False
+    low_cpu_fsdp: bool=False
+    run_validation: bool=True
+    batch_size_training: int=4
+    batching_strategy: str="packing" #alternative: padding
+    context_length: int=4096
+    gradient_accumulation_steps: int=1
+    gradient_clipping: bool = False
+    gradient_clipping_threshold: float = 1.0
+    num_epochs: int=3
+    max_train_step: int=0
+    max_eval_step: int=0
+    num_workers_dataloader: int=1
+    lr: float=1e-4
+    weight_decay: float=0.0
+    gamma: float= 0.85
+    seed: int=42
+    use_fp16: bool=False
+    mixed_precision: bool=True
+    val_batch_size: int=1
+    dataset = "samsum_dataset"
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
+    use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
+    output_dir: str = "PATH/to/save/PEFT/model"
+    freeze_layers: bool = False
+    num_freeze_layers: int = 1
+    quantization: bool = False
+    one_gpu: bool = False
+    save_model: bool = True
+    dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP
+    dist_checkpoint_folder: str="fine-tuned" # will be used if using FSDP
+    save_optimizer: bool=False # will be used if using FSDP
+    use_fast_kernels: bool = False # Enable using SDPA from PyTorch Accelerated Transformers, which makes use of Flash Attention and Xformer memory-efficient kernels
+    use_wandb: bool = False # Enable wandb for experiment tracking
+    save_metrics: bool = False # saves training metrics to a json file for later plotting
+    flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
+    flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
+    use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
+    profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
@@ -167,3 +183,9 @@ save_optimizer: bool=False
 * `fsdp_activation_checkpointing` enables activation checkpointing for FSDP; this saves a significant amount of memory with the trade-off of recomputing intermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase the throughput. We recommend you use this option.
 
 * `pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
+
+## FLOPS Counting and Pytorch Profiling
+
+To help with benchmarking effort, we are adding the support for counting the FLOPS during the fine-tuning process. You can achieve this by setting `--flop_counter` when launching your single/multi GPU fine-tuning. Use `--flop_counter_start` to choose which step to count the FLOPS. It is recommended to allow a warm-up stage before using the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model using [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage and the current config is wait=1, warmup=2, active=3, thus the profiler will start the profiling after step 3 and will record the next 3 steps. Therefore, in order to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.