NVIDIA-NeMo
diff --git a/‎.github/workflows/code-linting.yml‎
Lines changed: 9 additions & 178 deletions b/‎.github/workflows/code-linting.yml‎
Lines changed: 9 additions & 178 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 37 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎nemo_lm/converters/common.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo_lm/converters/common.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo_lm/converters/llama.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo_lm/converters/llama.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo_lm/converters/state_transform.py‎
Lines changed: 3 additions & 2 deletions b/‎nemo_lm/converters/state_transform.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎nemo_lm/data/builders/__init__.py‎
Lines changed: 0 additions & 1 deletion b/‎nemo_lm/data/builders/__init__.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎nemo_lm/data/builders/finetuning_dataset.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo_lm/data/builders/finetuning_dataset.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo_lm/data/builders/hf_dataset.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo_lm/data/builders/hf_dataset.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo_lm/data/datasets/packed_sequence.py‎
Lines changed: 13 additions & 12 deletions b/‎nemo_lm/data/datasets/packed_sequence.py‎
Lines changed: 13 additions & 12 deletions
diff --git a/‎nemo_lm/data/datasets/packing_utils.py‎
Lines changed: 2 additions & 1 deletion b/‎nemo_lm/data/datasets/packing_utils.py‎
Lines changed: 2 additions & 1 deletion
@@ -11,194 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-name: PyLint and flake8 linting
+name: Code linting
 
 on:
   pull_request:
     types: [opened, synchronize, reopened, labeled, unlabeled]
   workflow_call:
 
 jobs:
-  linting:
+  lint-check:
+    name: Lint check
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
+      - name: Checkout repository
         uses: actions/checkout@v4
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v5
-        with:
-          version: "0.7.10"
-
-      - name: Install dependencies
-        run: |
-          uv sync --only-group test
-
-      - name: Get changed files
-        id: changed-files
-        uses: step-security/[email protected]
         with:
-          files: |
-            **/*.py
-
-      - name: Run PyLint
-        id: pylint
-        env:
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-          SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
-          SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
-        run: |
-          if [[ -z "$CHANGED_FILES" ]]; then
-            echo Nothing to lint.
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          if [[ $SKIP_DOCS == true ]]; then
-            ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
-          else
-            ADDITIONAL_PYLINT_ARGS=""
-          fi
-
-          if [[ $SKIP_LINTING == true ]]; then
-            ADDITIONAL_PYLINT_ARGS="--exit-zero"
-          fi
-
-          set +e
-          uv run --only-group test pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.txt" --rcfile ".pylintrc" ${CHANGED_FILES[@]}
-          echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
+          submodules: 'recursive'
 
-      - name: Run flake8
-        id: flake8
-        env:
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-          SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
+      - name: Check lint
         run: |
-          if [[ -z "$CHANGED_FILES" ]]; then
-            echo Nothing to lint.
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          if [[ $SKIP_LINTING == true ]]; then
-            ADDITIONAL_FLAKE8_ARGS="--exit-zero"
-          else
-            ADDITIONAL_FLAKE8_ARGS=""
-          fi
-
-          set +e
-          uv run --only-group test flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.txt" --config ".flake8" ${CHANGED_FILES[@]}
-          echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
-
-      - name: Run isort
-        id: isort
-        env:
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-          SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
-        run: |
-          if [[ -z "$CHANGED_FILES" ]]; then
-            echo Nothing to lint.
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          set +e
-          uv run --only-group test isort --check-only --diff ${CHANGED_FILES[@]} > isort.txt 2>&1
-          ISORT_EXIT_CODE=$?
-
-          if [[ $SKIP_LINTING == true ]]; then
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-          else
-            echo "exit-code=$ISORT_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
-          fi
-
-      - name: Run black
-        id: black
-        env:
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-          SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
-        run: |
-          if [[ -z "$CHANGED_FILES" ]]; then
-            echo Nothing to lint.
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          set +e
-          uv run --only-group test black --check --diff ${CHANGED_FILES[@]} > black.txt 2>&1
-          BLACK_EXIT_CODE=$?
-
-          if [[ $SKIP_LINTING == true ]]; then
-            echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
-          else
-            echo "exit-code=$BLACK_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
-          fi
-
-      - name: Summary
-        env:
-          PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
-          FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
-          ISORT: ${{ steps.isort.outputs.exit-code == 0 }}
-          BLACK: ${{ steps.black.outputs.exit-code == 0 }}
-          SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
-        run: |
-          if [[ "$PYLINT" != "true" ]]; then
-            echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
-
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-            cat pylintrc.txt | tee -a $GITHUB_STEP_SUMMARY
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-          fi
-
-          if [[ "$FLAKE8" != "true" ]]; then
-            echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
-
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-            cat flake8.txt | tee -a $GITHUB_STEP_SUMMARY
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-          fi
-
-          if [[ "$ISORT" != "true" ]]; then
-            echo "isort output:" | tee -a $GITHUB_STEP_SUMMARY
-
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-            cat isort.txt | tee -a $GITHUB_STEP_SUMMARY
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-          fi
-
-          if [[ "$BLACK" != "true" ]]; then
-            echo "black output:" | tee -a $GITHUB_STEP_SUMMARY
-
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-            cat black.txt | tee -a $GITHUB_STEP_SUMMARY
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-          fi
-
-          if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" || "$ISORT" != "true" || "$BLACK" != "true" ]]; then
-            echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
-
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-            echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
-            echo '```' | tee -a $GITHUB_STEP_SUMMARY
-
-            exit 1
-          fi
-
-  Nemo_Linting_Test:
-    needs: linting
-    runs-on: ubuntu-latest
-    if: always()
-    steps:
-      - name: Main
-        env:
-          RESULTS: ${{ toJson(needs.linting) }}
-        run: |
-          RESULT=$(echo "$RESULTS" | jq -r '.result')
-
-          if [[ "$RESULT" == "success" ]]; then
-            echo "All passed."
-            exit 0
-          else
-            echo "Some linting domains failed."
-            exit 1
-          fi
+          pip install pre-commit==3.6.0
+          pre-commit install
+          pre-commit run --all-files --show-diff-on-failure --color=always
@@ -0,0 +1,37 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+    - id: end-of-file-fixer
+      # only include python files
+      files: \.py$
+    - id: trailing-whitespace
+      # only include python files
+      files: \.py$
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: "v0.9.9" # Use the appropriate version
+    hooks:
+    - id: ruff
+      args: ["--fix"]
+    - id: ruff
+      args: ["check", "--select", "I", "--fix"]
+    - id: ruff-format
+
+  - repo: local
+    hooks:
+      - id: no-underscore-md
+        name: "Disallow '_' in Markdown filenames"
+        language: system
+        entry: |
+          bash -c '
+            # Report the offending files
+            echo "[pre-commit] ERROR: Found Markdown files with underscores:" >&2
+            for file in "$@"; do
+              echo "  - $file (use hyphens instead)" >&2
+            done
+            exit 1
+          '
+        files: '.*\/[^\/]*_[^\/]*\.md$'
+        exclude: '^\.github/'
+        types: [file]
@@ -36,6 +36,7 @@
 from nemo_lm.training.state import GlobalState
 from nemo_lm.utils.instantiate_utils import instantiate
 
+
 if TYPE_CHECKING:
     from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
 
 
@@ -22,6 +22,7 @@
 from nemo_lm.converters.state_transform import TransformFns, apply_transforms, state_transform
 from nemo_lm.models.llama import Llama4Config, Llama31Config, LlamaConfig
 
+
 if TYPE_CHECKING:
     from transformers import LlamaConfig as HFLlamaConfig
     from transformers import LlamaForCausalLM
 
@@ -22,6 +22,7 @@
 import torch
 from torch import nn
 
+
 SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module)
 TargetModuleT = TypeVar("TargetModuleT", bound=nn.Module)
 F = TypeVar("F", bound=Callable[..., Any])
@@ -239,8 +240,8 @@ def scale_weights(ctx):
 
     assert target_orig_dtypes == extract_dtypes(_target.named_parameters()), (
         f"dtype mismatch between source and target state dicts. "
-        f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v!=torch.bfloat16} }, "
-        f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v!=torch.bfloat16} }"
+        f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v != torch.bfloat16} }, "
+        f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v != torch.bfloat16} }"
     )
     if hasattr(target, "module") and isinstance(target.module, MegatronModule):
         target.module = _target
 
@@ -1 +0,0 @@
-
@@ -23,6 +23,7 @@
 from nemo_lm.tokenizers.tokenizer import _HuggingFaceTokenizer
 from nemo_lm.utils.common_utils import get_rank_safe, print_rank_0
 
+
 logger = logging.getLogger(__name__)
 
 
 
@@ -30,6 +30,7 @@
 from nemo_lm.training.config import FinetuningDatasetConfig
 from nemo_lm.utils.common_utils import print_rank_0
 
+
 logger = logging.getLogger(__name__)
 
 
 
@@ -23,6 +23,7 @@
 from nemo_lm.data.datasets.sft import create_sft_dataset
 from nemo_lm.tokenizers.tokenizer import MegatronTokenizer
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -155,18 +156,18 @@ class PackedSequenceSpecs:
     def __post_init__(self):
         if self.packed_train_data_path is not None:
             self.packed_train_data_path = Path(self.packed_train_data_path)
-            assert (
-                self.packed_train_data_path.suffix == ".npy"
-            ), f"packed training data file must be a .npy file: {self.packed_train_data_path}"
-            assert (
-                self.packed_train_data_path.exists()
-            ), f"packed training data file does not exist: {self.packed_train_data_path}"
+            assert self.packed_train_data_path.suffix == ".npy", (
+                f"packed training data file must be a .npy file: {self.packed_train_data_path}"
+            )
+            assert self.packed_train_data_path.exists(), (
+                f"packed training data file does not exist: {self.packed_train_data_path}"
+            )
 
         if self.packed_val_data_path is not None:
             self.packed_val_data_path = Path(self.packed_val_data_path)
-            assert (
-                self.packed_val_data_path.suffix == ".npy"
-            ), f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
-            assert (
-                self.packed_val_data_path.exists()
-            ), f"packed validation data file does not exist: {self.packed_val_data_path}"
+            assert self.packed_val_data_path.suffix == ".npy", (
+                f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
+            )
+            assert self.packed_val_data_path.exists(), (
+                f"packed validation data file does not exist: {self.packed_val_data_path}"
+            )
@@ -19,6 +19,7 @@
 import numpy as np
 from tqdm import tqdm
 
+
 PACKING_ALGOS = ["first_fit_decreasing", "first_fit_shuffle"]
 
 logger = logging.getLogger(__name__)
@@ -178,7 +179,7 @@ def create_packing_strategy(
 
     logger.debug("Packed sequence lengths:")
     logger.debug(packed_seq_lens)
-    logger.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient")
+    logger.info(f"Packing is {sum(packed_seq_lens) / len(packed_seq_lens) / pack_size * 100:.2f}% efficient")
     logger.info(
         f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<"
     )