
Commit df0b782 ("first version - FT"), 1 parent: afce944


59 files changed: +24553 −1 lines

.github/workflows/pre-commit.yml

Lines changed: 14 additions & 0 deletions
```yaml
name: pre-commit
on: [pull_request, push]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install pre-commit
        run: python -m pip install --upgrade pre-commit
      - name: Run pre-commit
        run: pre-commit run --all-files
```

.gitignore

Lines changed: 21 additions & 1 deletion
```diff
@@ -427,4 +427,24 @@ transliteration*
 .venv*
 infra/target*
 .vscode*
-junk/
+junk/
+*.code-workspace
+
+# Machine learning artifacts and cache directories
+mlruns/
+custom_data/
+output_model_dir/
+nemo_rnnt_da/
+training_console.log
+cache*
+.mypy_cache/
+.ruff_cache/
+
+# Local dataset directories
+apps/whisper_fine_tuning/data/
+
+*.DS_Store
+
+data/
+
+predictions_dir/
```

.pre-commit-config.yaml

Lines changed: 44 additions & 0 deletions
```yaml
repos:
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        language_version: python3.11  # require Python 3.11 or newer

  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: v0.1.0
    hooks:
      - id: ruff
        args: [--fix, --extend-ignore, E402]  # ruff will auto-fix many issues
        additional_dependencies: []

  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black"]

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-added-large-files

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1  # pick a valid tag you confirmed with git ls-remote
    hooks:
      - id: mypy
        # keep commonly useful flags, then selectively disable error codes reported by mypy
        args:
          - --ignore-missing-imports
          - --disable-error-code=import-untyped
          - --disable-error-code=call-arg
          - --disable-error-code=union-attr
          - --disable-error-code=arg-type
          - --disable-error-code=used-before-def
          - --disable-error-code=attr-defined
        files: \.py$
        language_version: python3.11
```

apps/whisper_fine_tuning/Makefile

Lines changed: 49 additions & 0 deletions
```makefile
export HOME := $(HOME)
.ONESHELL:

ifeq ($(OS),Windows_NT)
    SHELL = cmd
    CONDA_ACTIVATE = call %CONDA_PREFIX%\Scripts\activate.bat
else
    SHELL = /bin/bash
    # source conda for a non-interactive shell; the environment name follows the macro
    CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate
endif

setup_aml:
	rm -rf ~/.pyenv
	curl https://pyenv.run | bash
	$(HOME)/.pyenv/bin/pyenv --version
	$(HOME)/.pyenv/bin/pyenv install 3.12 --skip-existing
	$(HOME)/.pyenv/bin/pyenv local 3.12
	python --version
	conda create -n condav0 python=3.12
	$(CONDA_ACTIVATE) condav0
	conda install -c conda-forge poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock --no-update
	poetry install
	conda install pip
	conda install -c conda-forge "ffmpeg>=5,<7"
	sudo apt update && sudo apt install -y ffmpeg
	python -m ipykernel install --user --name condav0 --display-name "condav0"

USERPROFILE := $(USERPROFILE)
CURRENT_DIR := $(shell cd)

setup_win:
	if exist %USERPROFILE%\.pyenv rmdir /s /q %USERPROFILE%\.pyenv
	git clone https://github.com/pyenv-win/pyenv-win.git "%USERPROFILE%\.pyenv"
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv --version
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv install 3.12 --skip-existing
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv local 3.12
	python --version
	python -m venv venv
	echo $(CURRENT_DIR)
	"$(CURRENT_DIR)/venv/Scripts/activate"
	pip install poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock
	poetry install
	conda install pip
```

apps/whisper_fine_tuning/README.md

Lines changed: 228 additions & 0 deletions
# <img src="./docs/img/azure_logo.png" alt="Azure Logo" style="width:30px;height:30px;"/> Fine-Tuning Open-Source LLM Models: QLoRA and LoRA Features Implemented

## Overview
Open-source LLMs are powerful but require fine-tuning for specific tasks such as chatbots or content generation. Fine-tuning these models can be expensive because of the substantial VRAM it demands: fully fine-tuning the Llama 7B model, for instance, requires 112 GB of VRAM. Techniques like QLoRA and PEFT can significantly reduce these requirements.

# Whisper Fine-Tuning Pipeline

This repository provides a pipeline for fine-tuning OpenAI's Whisper models on custom audio datasets using LoRA (Low-Rank Adaptation) and PEFT (Parameter-Efficient Fine-Tuning). The solution is designed for scalable training and deployment on Azure Machine Learning.
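For orientation, here is a minimal sketch of what LoRA wrapping looks like with PEFT. The rank, alpha, and `target_modules` values are illustrative assumptions, not the settings shipped in `src/core/config.py`:

```python
# Sketch: wrap a Whisper checkpoint with a LoRA adapter via PEFT so that
# only the low-rank adapter weights train. The r/alpha/target_modules
# values are illustrative, not the project's actual configuration.
from peft import LoraConfig, get_peft_model
from transformers import WhisperForConditionalGeneration

base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

lora_config = LoraConfig(
    r=8,                                  # rank of the low-rank update
    lora_alpha=32,                        # scaling applied to the update
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    lora_dropout=0.05,
)

model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # typically ~1% of all parameters
```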
## Features
- Fine-tune Whisper models with LoRA and PEFT
- Custom data loading and preprocessing for speech datasets
- MLflow integration for experiment tracking and model logging, including all console logs as artifacts (see the sketch after this list)
- Azure ML job specification for cloud training
- Modular codebase for easy extension
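As a sketch of the console-log feature: the pattern below mirrors console output into a file and attaches it to the active MLflow run. Only the artifact name `training_console.log` comes from this repository; the handler wiring is an assumed illustration.

```python
# Sketch: mirror console output into a file and attach it to the MLflow run.
# Only the artifact name training_console.log comes from this repository;
# the handler wiring is an assumed illustration.
import logging
import mlflow

logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),                      # keep printing to console
        logging.FileHandler("training_console.log"),  # also capture to a file
    ],
)

with mlflow.start_run():
    logging.info("training started")
    # ... training loop goes here ...
    for handler in logging.getLogger().handlers:
        handler.flush()                               # complete the file first
    mlflow.log_artifact("training_console.log")       # upload as an artifact
```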
## Directory Structure
```
src/core/
  config.py                # Training and LoRA configuration classes
  load_data.py             # Data loading and preprocessing
  train.py                 # Trainer class and training logic
deployment/
  job_data.py              # Data preparation script
  job_train.py             # Training script
  environment.yml          # Conda environment for Azure ML
  training_job.yaml        # Azure ML job specification
notebooks/
  fine_tuned_usage.ipynb   # Example notebook for inference and usage
data/
  dataset_silver/          # Example processed dataset
```
## Setup
1. **Clone the repository**
   ```bash
   git clone <repo-url>
   cd whisper-fine-tuning
   ```
2. **Prepare your dataset**
   - Place your audio datasets in a directory (e.g., `data/train_raw`, `data/evaluation_raw`).
   - Ensure datasets are in Hugging Face `datasets` format.
3. **Create the environment**
   ```bash
   # Use Python 3.12 for the Poetry environment
   poetry env use 3.12

   # Activate the virtual environment
   poetry shell

   # Install all project dependencies
   poetry install
   ```
## Usage

### Data Preparation

For detailed instructions on dataset preparation, refer to the README in the `create_data` directory. Training and evaluation data are expected in Hugging Face `datasets` format, as sketched below.
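To make the expected format concrete, here is a minimal sketch. The path and the `audio`/`sentence` column names are assumptions for illustration; the actual schema is defined by the create_data pipeline.

```python
# Sketch: load a dataset saved with Dataset.save_to_disk and resample its
# audio to 16 kHz. The path and the "audio"/"sentence" column names are
# assumptions for illustration; see the create_data README for the schema.
from datasets import Audio, load_from_disk

ds = load_from_disk("data/train_raw")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))  # decode/resample on access

sample = ds[0]
print(sample["audio"]["array"].shape, sample["sentence"])
```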
### Model Training

Run the training pipeline:
```bash
python deployment/training/job_train.py --model_name openai/whisper-large-v2 --dataset ./data/silver/dataset --apply_lora True
```

To log the run under a dedicated MLflow experiment, add `--experiment_name`. This command fine-tunes OpenAI's Whisper model with LoRA applied on your specified dataset:

```bash
python deployment/training/job_train.py --model_name openai/whisper-large-v2 --dataset ./data/silver/dataset --apply_lora True --experiment_name "whisper-mayoruna-v2"
```

This approach helps optimize resource usage while enabling effective model adaptation.

- All console and logging output is saved as an MLflow artifact (`training_console.log`).

## Model Evaluation

```bash
python src/core/evaluation/evaluation_process.py --is_public_repo False --ckpt_dir "output_model_dir" --temp_ckpt_folder "temp" --eval_datasets data/raw/testing --device 0 --batch_size 16 --output_dir predictions_dir
```

The NeMo RNN-T training script accepts the same dataset, with either a top-level or a nested output directory:

```bash
python deployment/training/job_train_nemo.py \
    --dataset_path data/silver/dataset \
    --output_dir nemo_rnnt_da/

python deployment/training/job_train_nemo.py \
    --dataset_path data/silver/dataset \
    --output_dir output_model_dir/nemo_rnnt_da \
    --num_workers 0
```
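For reference, speech-recognition evaluation typically reports word error rate (WER). The sketch below shows the metric in isolation with the `evaluate` library; `evaluation_process.py` may apply its own text normalization and I/O around it.

```python
# Sketch: word error rate (WER) in isolation with the evaluate library.
# evaluation_process.py may apply its own text normalization around this.
import evaluate

wer_metric = evaluate.load("wer")
references = ["the cat sat on the mat"]
predictions = ["the cat sat on a mat"]

wer = wer_metric.compute(predictions=predictions, references=references)
print(f"WER: {wer:.2%}")  # fraction of word-level edits; lower is better
```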
### Azure ML Training
1. **Configure Azure ML compute and workspace.**
2. **Submit the job:**
   ```bash
   az ml job create --file deployment/training_job.yaml
   ```
## Arguments

### `job_data.py`
- `--model_name`: Hugging Face model name (default: `openai/whisper-small`)
- `--train_datasets`: List of training dataset paths (required)
- `--eval_datasets`: List of evaluation dataset paths (optional)
- `--sampling_rate`: Audio sampling rate in Hz (default: 16000)
- `--num_proc`: Number of parallel jobs for data prep (default: 2)
- `--output_dir`: Output directory for the processed dataset

### `job_train.py`
- `--model_name`: Hugging Face model name (default: `openai/whisper-small`)
- `--dataset`: Path to the processed dataset (required)
- `--output_dir`: Output directory for checkpoints (default: `output_model_dir`)
- `--apply_lora`: Whether to apply LoRA (default: True)
- `--language`: Language for adaptation (default: Hindi)
- `--sampling_rate`: Audio sampling rate in Hz (default: 16000)
- `--num_proc`: Number of parallel jobs for data prep (default: 2)
- `--train_strategy`: Training strategy, `steps` or `epoch` (default: `steps`)
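For orientation, here is a hedged sketch of how these flags could be declared with `argparse`; the real `job_train.py` may define them differently.

```python
# Sketch: how job_train.py's flags could be declared with argparse.
# Purely illustrative; the actual script may define them differently.
import argparse

parser = argparse.ArgumentParser(description="Fine-tune Whisper with optional LoRA")
parser.add_argument("--model_name", default="openai/whisper-small")
parser.add_argument("--dataset", required=True, help="path to the processed dataset")
parser.add_argument("--output_dir", default="output_model_dir")
# note: type=bool would treat the string "False" as True, hence the lambda
parser.add_argument("--apply_lora", type=lambda s: s.lower() == "true", default=True)
parser.add_argument("--language", default="Hindi")
parser.add_argument("--sampling_rate", type=int, default=16000)
parser.add_argument("--num_proc", type=int, default=2)
parser.add_argument("--train_strategy", choices=["steps", "epoch"], default="steps")

args = parser.parse_args()
```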

## Customization
- Modify `src/core/config.py` to adjust training and LoRA parameters.
- Update `deployment/environment.yml` to add or remove dependencies.
- Edit `deployment/training_job.yaml` for Azure ML compute, environment, and output settings.

## Outputs
- Trained model checkpoints in the specified output directory
- MLflow experiment logs (including all console logs as artifacts)
- Optionally, model artifacts pushed to the Hugging Face Hub

## Inference Example

See `notebooks/fine_tuned_usage.ipynb` for detailed usage.
To load and use a model logged with MLflow:
```python
import mlflow.transformers

model_uri = "models:/whisper-fine-tuned/1"  # or your own model URI
pipe = mlflow.transformers.load_model(model_uri)
result = pipe("path/to/audio/file.wav")
print(result["text"])
```
Or, using the pyfunc interface:
```python
import mlflow.pyfunc
import pandas as pd

loaded_model = mlflow.pyfunc.load_model(model_uri)
df = pd.DataFrame({"inputs": ["path/to/audio/file.wav"]})
result = loaded_model.predict(df)
print(result)
```
## Steps to Configure Pre-commit

1. Install pre-commit (if not already installed):

   ```bash
   pip install pre-commit
   ```

2. Use the `.pre-commit-config.yaml` file in the root of the repository.

3. Install the pre-commit hooks:

   ```bash
   pre-commit install
   ```

4. Run the hooks manually on all files to ensure consistency:

   ```bash
   pre-commit run --all-files
   ```

Integrating pre-commit into your workflow helps maintain consistent code standards and prevents common issues before they enter your codebase.

## Troubleshooting
- Ensure all dependencies in `environment.yml` are installed.
- Check dataset format and paths.
- Review the Azure ML compute and environment configuration if running in the cloud.

## License
This project is licensed under the MIT License.

## Contact
For questions or support, please open an issue or contact the repository maintainer.
## Tips for the Current Scenario

| Parameter | Suggested | Reason |
|-----------|-----------|--------|
| `per_device_train_batch_size` | 4–8 | With only 30 samples, large batches are unnecessary and may cause instability. |
| `gradient_accumulation_steps` | 1–4 | Increase if you want to simulate larger batches without memory overhead. |
| `num_train_epochs` | 10–30 | More epochs help the model learn from limited data. Use early stopping or monitor validation loss. |
| `max_steps` | Remove or set to -1 | Let training be driven by `num_train_epochs` to avoid premature stopping. |
| `warmup_steps` | 50–100 | 20k is far too high for such a small dataset. |
| `learning_rate` | 1e-5 to 3e-5 | Your current value is reasonable, but monitor for overfitting. |
| `eval_strategy` | `"epoch"` | With few samples, evaluating per epoch is more meaningful. |
| `save_strategy` | `"epoch"` | Align with evaluation to save meaningful checkpoints. |
| `fp16` | ✅ | Keep if your hardware supports it. |
| `gradient_checkpointing` | ✅ | Helps with memory efficiency. |
| `optim` | `"adamw_bnb_8bit"` | Good choice for memory-constrained environments. |
| `load_best_model_at_end` | ✅ | Helps select the best checkpoint. |
| `predict_with_generate` | ✅ | Needed for Whisper's transcription tasks. |
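Mapped onto Hugging Face `Seq2SeqTrainingArguments`, the suggestions above could look like the following sketch (illustrative values, not the project's shipped configuration):

```python
# Sketch: Seq2SeqTrainingArguments following the table above, sized for a
# tiny (~30-sample) dataset. Values are suggestions, not the repo's config.
# Note: on transformers < 4.41 the eval flag is named evaluation_strategy.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="output_model_dir",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,      # effective batch size of 8
    num_train_epochs=20,
    max_steps=-1,                       # let num_train_epochs drive training
    warmup_steps=50,
    learning_rate=1e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,                          # keep if the GPU supports it
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",             # requires the bitsandbytes package
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # also enables early stopping below
    greater_is_better=False,
    predict_with_generate=True,
)
```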

---

### 🧠 Additional Tips

- **Use Data Augmentation**: Consider adding noise, pitch shift, or speed variation to artificially expand your dataset.
- **Freeze Most Layers**: Fine-tune only the final layers or adapters to reduce overfitting.
- **Use Early Stopping**: Monitor validation loss and stop training when it plateaus (see the sketch after this list).
- **Log Carefully**: Avoid logging too many parameters to MLflow to prevent the `INVALID_PARAMETER_VALUE` error.
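A minimal sketch of early stopping with the `transformers` `EarlyStoppingCallback`, assuming `model`, `train_ds`, and `eval_ds` already exist from the pipeline's own setup:

```python
# Sketch: early stopping on validation loss, reusing training_args from
# the sketch above (it already sets load_best_model_at_end=True and
# metric_for_best_model="eval_loss", which EarlyStoppingCallback requires).
# model, train_ds, and eval_ds are placeholders for the pipeline's own objects.
from transformers import EarlyStoppingCallback, Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,            # e.g. the PEFT-wrapped Whisper model
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()             # stops after 3 evaluations without improvement
```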
