187 changes: 9 additions & 178 deletions .github/workflows/code-linting.yml
@@ -11,194 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: PyLint and flake8 linting
name: Code linting

on:
pull_request:
types: [opened, synchronize, reopened, labeled, unlabeled]
workflow_call:

jobs:
linting:
lint-check:
name: Lint check
runs-on: ubuntu-latest
steps:
- name: Checkout
- name: Checkout repository
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "0.7.10"

- name: Install dependencies
run: |
uv sync --only-group test
- name: Get changed files
id: changed-files
uses: step-security/changed-files@…
with:
files: |
**/*.py
- name: Run PyLint
id: pylint
env:
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
run: |
if [[ -z "$CHANGED_FILES" ]]; then
echo Nothing to lint.
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
exit 0
fi
if [[ $SKIP_DOCS == true ]]; then
ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
else
ADDITIONAL_PYLINT_ARGS=""
fi
if [[ $SKIP_LINTING == true ]]; then
ADDITIONAL_PYLINT_ARGS="--exit-zero"
fi
set +e
uv run --only-group test pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.txt" --rcfile ".pylintrc" ${CHANGED_FILES[@]}
echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
submodules: 'recursive'

- name: Run flake8
id: flake8
env:
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
- name: Check lint
run: |
if [[ -z "$CHANGED_FILES" ]]; then
echo Nothing to lint.
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
exit 0
fi
if [[ $SKIP_LINTING == true ]]; then
ADDITIONAL_FLAKE8_ARGS="--exit-zero"
else
ADDITIONAL_FLAKE8_ARGS=""
fi
set +e
uv run --only-group test flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.txt" --config ".flake8" ${CHANGED_FILES[@]}
echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
- name: Run isort
id: isort
env:
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
run: |
if [[ -z "$CHANGED_FILES" ]]; then
echo Nothing to lint.
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
exit 0
fi
set +e
uv run --only-group test isort --check-only --diff ${CHANGED_FILES[@]} > isort.txt 2>&1
ISORT_EXIT_CODE=$?
if [[ $SKIP_LINTING == true ]]; then
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
else
echo "exit-code=$ISORT_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
fi
- name: Run black
id: black
env:
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
run: |
if [[ -z "$CHANGED_FILES" ]]; then
echo Nothing to lint.
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
exit 0
fi
set +e
uv run --only-group test black --check --diff ${CHANGED_FILES[@]} > black.txt 2>&1
BLACK_EXIT_CODE=$?
if [[ $SKIP_LINTING == true ]]; then
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
else
echo "exit-code=$BLACK_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
fi
- name: Summary
env:
PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
ISORT: ${{ steps.isort.outputs.exit-code == 0 }}
BLACK: ${{ steps.black.outputs.exit-code == 0 }}
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
run: |
if [[ "$PYLINT" != "true" ]]; then
echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
cat pylintrc.txt | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
fi
if [[ "$FLAKE8" != "true" ]]; then
echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
cat flake8.txt | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
fi
if [[ "$ISORT" != "true" ]]; then
echo "isort output:" | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
cat isort.txt | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
fi
if [[ "$BLACK" != "true" ]]; then
echo "black output:" | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
cat black.txt | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
fi
if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" || "$ISORT" != "true" || "$BLACK" != "true" ]]; then
echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
echo '```' | tee -a $GITHUB_STEP_SUMMARY
exit 1
fi
Nemo_Linting_Test:
needs: linting
runs-on: ubuntu-latest
if: always()
steps:
- name: Main
env:
RESULTS: ${{ toJson(needs.linting) }}
run: |
RESULT=$(echo "$RESULTS" | jq -r '.result')
if [[ "$RESULT" == "success" ]]; then
echo "All passed."
exit 0
else
echo "Some linting domains failed."
exit 1
fi
pip install pre-commit==3.6.0
pre-commit install
pre-commit run --all-files --show-diff-on-failure --color=always
37 changes: 37 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,37 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: end-of-file-fixer
# Only include Python files
files: \.py$
- id: trailing-whitespace
# Only include Python files
files: \.py$

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.9.9" # Use the appropriate version
hooks:
- id: ruff
args: ["--fix"]
- id: ruff
args: ["check", "--select", "I", "--fix"]
- id: ruff-format

- repo: local
hooks:
- id: no-underscore-md
name: "Disallow '_' in Markdown filenames"
language: system
entry: |
bash -c '
# Report the offending files; pre-commit passes them as arguments.
# The trailing "--" fills $0 so that "$@" receives every filename.
echo "[pre-commit] ERROR: Found Markdown files with underscores:" >&2
for file in "$@"; do
echo " - $file (use hyphens instead)" >&2
done
exit 1
' --
files: '.*\/[^\/]*_[^\/]*\.md$'
exclude: '^\.github/'
types: [file]
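
These hooks run in CI via `pre-commit run --all-files` (see the workflow change above) and can be reproduced locally the same way. For context, a minimal sketch of what the `no-underscore-md` pattern catches, using hypothetical paths and assuming pre-commit applies `files`/`exclude` with `re.search`:

```python
import re

# Patterns copied from the hook configuration above.
FILES = re.compile(r".*/[^/]*_[^/]*\.md$")
EXCLUDE = re.compile(r"^\.github/")

candidates = [
    "docs/user_guide.md",                # flagged: underscore in the filename
    "docs/user-guide.md",                # ok: hyphenated
    "release_notes.md",                  # not flagged: no "/" in the path, so FILES misses it
    ".github/PULL_REQUEST_TEMPLATE.md",  # matched by FILES but excluded by ^\.github/
]
for path in candidates:
    flagged = bool(FILES.search(path)) and not EXCLUDE.search(path)
    print(f"{path}: {'flagged' if flagged else 'ok'}")
```

Note that the pattern requires a `/`, so a top-level Markdown file with underscores would slip through as written.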
1 change: 1 addition & 0 deletions nemo_lm/converters/common.py
@@ -36,6 +36,7 @@
from nemo_lm.training.state import GlobalState
from nemo_lm.utils.instantiate_utils import instantiate


if TYPE_CHECKING:
from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig

1 change: 1 addition & 0 deletions nemo_lm/converters/llama.py
@@ -22,6 +22,7 @@
from nemo_lm.converters.state_transform import TransformFns, apply_transforms, state_transform
from nemo_lm.models.llama import Llama4Config, Llama31Config, LlamaConfig


if TYPE_CHECKING:
from transformers import LlamaConfig as HFLlamaConfig
from transformers import LlamaForCausalLM
5 changes: 3 additions & 2 deletions nemo_lm/converters/state_transform.py
@@ -22,6 +22,7 @@
import torch
from torch import nn


SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module)
TargetModuleT = TypeVar("TargetModuleT", bound=nn.Module)
F = TypeVar("F", bound=Callable[..., Any])
@@ -239,8 +240,8 @@ def scale_weights(ctx):

assert target_orig_dtypes == extract_dtypes(_target.named_parameters()), (
f"dtype mismatch between source and target state dicts. "
f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v!=torch.bfloat16} }, "
f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v!=torch.bfloat16} }"
f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v != torch.bfloat16} }, "
f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v != torch.bfloat16} }"
)
if hasattr(target, "module") and isinstance(target.module, MegatronModule):
target.module = _target
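
The assertion above compares per-parameter dtype maps and reports only the entries that are not bfloat16. A minimal standalone sketch of that pattern — `extract_dtypes` here is a stand-in written for this example, not necessarily the module's implementation:

```python
import torch
from torch import nn

def extract_dtypes(named_parameters):
    # Map parameter name -> dtype; mirrors what the assertion compares.
    return {name: param.dtype for name, param in named_parameters}

model = nn.Linear(4, 4)  # float32 by default, so every entry gets reported
dtypes = extract_dtypes(model.named_parameters())
# Same filtering as the assertion message: show only non-bf16 parameters.
print({k: v for k, v in dtypes.items() if v != torch.bfloat16})
```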
1 change: 0 additions & 1 deletion nemo_lm/data/builders/__init__.py
@@ -1 +0,0 @@

1 change: 1 addition & 0 deletions nemo_lm/data/builders/finetuning_dataset.py
@@ -23,6 +23,7 @@
from nemo_lm.tokenizers.tokenizer import _HuggingFaceTokenizer
from nemo_lm.utils.common_utils import get_rank_safe, print_rank_0


logger = logging.getLogger(__name__)


1 change: 1 addition & 0 deletions nemo_lm/data/builders/hf_dataset.py
@@ -30,6 +30,7 @@
from nemo_lm.training.config import FinetuningDatasetConfig
from nemo_lm.utils.common_utils import print_rank_0


logger = logging.getLogger(__name__)


25 changes: 13 additions & 12 deletions nemo_lm/data/datasets/packed_sequence.py
@@ -23,6 +23,7 @@
from nemo_lm.data.datasets.sft import create_sft_dataset
from nemo_lm.tokenizers.tokenizer import MegatronTokenizer


logger = logging.getLogger(__name__)


@@ -155,18 +156,18 @@ class PackedSequenceSpecs:
def __post_init__(self):
if self.packed_train_data_path is not None:
self.packed_train_data_path = Path(self.packed_train_data_path)
assert (
self.packed_train_data_path.suffix == ".npy"
), f"packed training data file must be a .npy file: {self.packed_train_data_path}"
assert (
self.packed_train_data_path.exists()
), f"packed training data file does not exist: {self.packed_train_data_path}"
assert self.packed_train_data_path.suffix == ".npy", (
f"packed training data file must be a .npy file: {self.packed_train_data_path}"
)
assert self.packed_train_data_path.exists(), (
f"packed training data file does not exist: {self.packed_train_data_path}"
)

if self.packed_val_data_path is not None:
self.packed_val_data_path = Path(self.packed_val_data_path)
assert (
self.packed_val_data_path.suffix == ".npy"
), f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
assert (
self.packed_val_data_path.exists()
), f"packed validation data file does not exist: {self.packed_val_data_path}"
assert self.packed_val_data_path.suffix == ".npy", (
f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
)
assert self.packed_val_data_path.exists(), (
f"packed validation data file does not exist: {self.packed_val_data_path}"
)
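
The hunk above is a formatting-only change: ruff-format keeps the assert condition on one line and wraps the message in trailing parentheses. A standalone sketch of the same validation pattern, with a hypothetical helper name:

```python
from pathlib import Path

def validate_packed_path(value: str | Path) -> Path:
    # Hypothetical helper mirroring the __post_init__ checks above.
    path = Path(value)
    assert path.suffix == ".npy", (
        f"packed data file must be a .npy file: {path}"
    )
    assert path.exists(), (
        f"packed data file does not exist: {path}"
    )
    return path
```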
3 changes: 2 additions & 1 deletion nemo_lm/data/datasets/packing_utils.py
@@ -19,6 +19,7 @@
import numpy as np
from tqdm import tqdm


PACKING_ALGOS = ["first_fit_decreasing", "first_fit_shuffle"]

logger = logging.getLogger(__name__)
@@ -178,7 +179,7 @@ def create_packing_strategy(

logger.debug("Packed sequence lengths:")
logger.debug(packed_seq_lens)
logger.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient")
logger.info(f"Packing is {sum(packed_seq_lens) / len(packed_seq_lens) / pack_size * 100:.2f}% efficient")
logger.info(
f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<"
)
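
As a worked example of the efficiency figure in that log line, with hypothetical pack lengths: an average pack length of 6 against a `pack_size` of 8 gives 75% efficiency.

```python
# Hypothetical packed-sequence lengths for a pack_size of 8.
pack_size = 8
packed_seq_lens = [8, 6, 7, 5, 4]

# Same arithmetic as the log line above: average pack length / pack_size.
efficiency = sum(packed_seq_lens) / len(packed_seq_lens) / pack_size * 100
print(f"Packing is {efficiency:.2f}% efficient")  # -> Packing is 75.00% efficient
```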