Skip to content

Commit 9063c3c

Browse files
authored
Move to ruff and add pre-commit (#42)
* Move to ruff and add pre-commit Signed-off-by: Hemil Desai <[email protected]> * format fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> --------- Signed-off-by: Hemil Desai <[email protected]>
1 parent c087ace commit 9063c3c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+295
-356
lines changed

.github/workflows/code-linting.yml

Lines changed: 9 additions & 178 deletions
Original file line numberDiff line numberDiff line change
@@ -11,194 +11,25 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
name: PyLint and flake8 linting
14+
name: Code linting
1515

1616
on:
1717
pull_request:
1818
types: [opened, synchronize, reopened, labeled, unlabeled]
1919
workflow_call:
2020

2121
jobs:
22-
linting:
22+
lint-check:
23+
name: Lint check
2324
runs-on: ubuntu-latest
2425
steps:
25-
- name: Checkout
26+
- name: Checkout repository
2627
uses: actions/checkout@v4
27-
28-
- name: Install uv
29-
uses: astral-sh/setup-uv@v5
30-
with:
31-
version: "0.7.10"
32-
33-
- name: Install dependencies
34-
run: |
35-
uv sync --only-group test
36-
37-
- name: Get changed files
38-
id: changed-files
39-
uses: step-security/[email protected]
4028
with:
41-
files: |
42-
**/*.py
43-
44-
- name: Run PyLint
45-
id: pylint
46-
env:
47-
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
48-
SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
49-
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
50-
run: |
51-
if [[ -z "$CHANGED_FILES" ]]; then
52-
echo Nothing to lint.
53-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
54-
exit 0
55-
fi
56-
57-
if [[ $SKIP_DOCS == true ]]; then
58-
ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
59-
else
60-
ADDITIONAL_PYLINT_ARGS=""
61-
fi
62-
63-
if [[ $SKIP_LINTING == true ]]; then
64-
ADDITIONAL_PYLINT_ARGS="--exit-zero"
65-
fi
66-
67-
set +e
68-
uv run --only-group test pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.txt" --rcfile ".pylintrc" ${CHANGED_FILES[@]}
69-
echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
29+
submodules: 'recursive'
7030

71-
- name: Run flake8
72-
id: flake8
73-
env:
74-
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
75-
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
31+
- name: Check lint
7632
run: |
77-
if [[ -z "$CHANGED_FILES" ]]; then
78-
echo Nothing to lint.
79-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
80-
exit 0
81-
fi
82-
83-
if [[ $SKIP_LINTING == true ]]; then
84-
ADDITIONAL_FLAKE8_ARGS="--exit-zero"
85-
else
86-
ADDITIONAL_FLAKE8_ARGS=""
87-
fi
88-
89-
set +e
90-
uv run --only-group test flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.txt" --config ".flake8" ${CHANGED_FILES[@]}
91-
echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
92-
93-
- name: Run isort
94-
id: isort
95-
env:
96-
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
97-
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
98-
run: |
99-
if [[ -z "$CHANGED_FILES" ]]; then
100-
echo Nothing to lint.
101-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
102-
exit 0
103-
fi
104-
105-
set +e
106-
uv run --only-group test isort --check-only --diff ${CHANGED_FILES[@]} > isort.txt 2>&1
107-
ISORT_EXIT_CODE=$?
108-
109-
if [[ $SKIP_LINTING == true ]]; then
110-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
111-
else
112-
echo "exit-code=$ISORT_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
113-
fi
114-
115-
- name: Run black
116-
id: black
117-
env:
118-
CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
119-
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
120-
run: |
121-
if [[ -z "$CHANGED_FILES" ]]; then
122-
echo Nothing to lint.
123-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
124-
exit 0
125-
fi
126-
127-
set +e
128-
uv run --only-group test black --check --diff ${CHANGED_FILES[@]} > black.txt 2>&1
129-
BLACK_EXIT_CODE=$?
130-
131-
if [[ $SKIP_LINTING == true ]]; then
132-
echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
133-
else
134-
echo "exit-code=$BLACK_EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
135-
fi
136-
137-
- name: Summary
138-
env:
139-
PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
140-
FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
141-
ISORT: ${{ steps.isort.outputs.exit-code == 0 }}
142-
BLACK: ${{ steps.black.outputs.exit-code == 0 }}
143-
SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
144-
run: |
145-
if [[ "$PYLINT" != "true" ]]; then
146-
echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
147-
148-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
149-
cat pylintrc.txt | tee -a $GITHUB_STEP_SUMMARY
150-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
151-
fi
152-
153-
if [[ "$FLAKE8" != "true" ]]; then
154-
echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
155-
156-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
157-
cat flake8.txt | tee -a $GITHUB_STEP_SUMMARY
158-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
159-
fi
160-
161-
if [[ "$ISORT" != "true" ]]; then
162-
echo "isort output:" | tee -a $GITHUB_STEP_SUMMARY
163-
164-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
165-
cat isort.txt | tee -a $GITHUB_STEP_SUMMARY
166-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
167-
fi
168-
169-
if [[ "$BLACK" != "true" ]]; then
170-
echo "black output:" | tee -a $GITHUB_STEP_SUMMARY
171-
172-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
173-
cat black.txt | tee -a $GITHUB_STEP_SUMMARY
174-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
175-
fi
176-
177-
if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" || "$ISORT" != "true" || "$BLACK" != "true" ]]; then
178-
echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
179-
180-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
181-
echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
182-
echo '```' | tee -a $GITHUB_STEP_SUMMARY
183-
184-
exit 1
185-
fi
186-
187-
Nemo_Linting_Test:
188-
needs: linting
189-
runs-on: ubuntu-latest
190-
if: always()
191-
steps:
192-
- name: Main
193-
env:
194-
RESULTS: ${{ toJson(needs.linting) }}
195-
run: |
196-
RESULT=$(echo "$RESULTS" | jq -r '.result')
197-
198-
if [[ "$RESULT" == "success" ]]; then
199-
echo "All passed."
200-
exit 0
201-
else
202-
echo "Some linting domains failed."
203-
exit 1
204-
fi
33+
pip install pre-commit==3.6.0
34+
pre-commit install
35+
pre-commit run --all-files --show-diff-on-failure --color=always

.pre-commit-config.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v4.4.0
4+
hooks:
5+
- id: end-of-file-fixer
6+
# only include python files
7+
files: \.py$
8+
- id: trailing-whitespace
9+
# only include python files
10+
files: \.py$
11+
12+
- repo: https://github.com/astral-sh/ruff-pre-commit
13+
rev: "v0.9.9" # Use the appropriate version
14+
hooks:
15+
- id: ruff
16+
args: ["--fix"]
17+
- id: ruff
18+
args: ["check", "--select", "I", "--fix"]
19+
- id: ruff-format
20+
21+
- repo: local
22+
hooks:
23+
- id: no-underscore-md
24+
name: "Disallow '_' in Markdown filenames"
25+
language: system
26+
entry: |
27+
bash -c '
28+
# Report the offending files
29+
echo "[pre-commit] ERROR: Found Markdown files with underscores:" >&2
30+
for file in "$@"; do
31+
echo " - $file (use hyphens instead)" >&2
32+
done
33+
exit 1
34+
'
35+
files: '.*\/[^\/]*_[^\/]*\.md$'
36+
exclude: '^\.github/'
37+
types: [file]

nemo_lm/converters/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from nemo_lm.training.state import GlobalState
3737
from nemo_lm.utils.instantiate_utils import instantiate
3838

39+
3940
if TYPE_CHECKING:
4041
from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
4142

nemo_lm/converters/llama.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from nemo_lm.converters.state_transform import TransformFns, apply_transforms, state_transform
2323
from nemo_lm.models.llama import Llama4Config, Llama31Config, LlamaConfig
2424

25+
2526
if TYPE_CHECKING:
2627
from transformers import LlamaConfig as HFLlamaConfig
2728
from transformers import LlamaForCausalLM

nemo_lm/converters/state_transform.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import torch
2323
from torch import nn
2424

25+
2526
SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module)
2627
TargetModuleT = TypeVar("TargetModuleT", bound=nn.Module)
2728
F = TypeVar("F", bound=Callable[..., Any])
@@ -239,8 +240,8 @@ def scale_weights(ctx):
239240

240241
assert target_orig_dtypes == extract_dtypes(_target.named_parameters()), (
241242
f"dtype mismatch between source and target state dicts. "
242-
f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v!=torch.bfloat16} }, "
243-
f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v!=torch.bfloat16} }"
243+
f"Left side is { {k: v for k, v in target_orig_dtypes.items() if v != torch.bfloat16} }, "
244+
f"Right side is { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v != torch.bfloat16} }"
244245
)
245246
if hasattr(target, "module") and isinstance(target.module, MegatronModule):
246247
target.module = _target

nemo_lm/data/builders/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

nemo_lm/data/builders/finetuning_dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nemo_lm.tokenizers.tokenizer import _HuggingFaceTokenizer
2424
from nemo_lm.utils.common_utils import get_rank_safe, print_rank_0
2525

26+
2627
logger = logging.getLogger(__name__)
2728

2829

nemo_lm/data/builders/hf_dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from nemo_lm.training.config import FinetuningDatasetConfig
3131
from nemo_lm.utils.common_utils import print_rank_0
3232

33+
3334
logger = logging.getLogger(__name__)
3435

3536

nemo_lm/data/datasets/packed_sequence.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nemo_lm.data.datasets.sft import create_sft_dataset
2424
from nemo_lm.tokenizers.tokenizer import MegatronTokenizer
2525

26+
2627
logger = logging.getLogger(__name__)
2728

2829

@@ -155,18 +156,18 @@ class PackedSequenceSpecs:
155156
def __post_init__(self):
156157
if self.packed_train_data_path is not None:
157158
self.packed_train_data_path = Path(self.packed_train_data_path)
158-
assert (
159-
self.packed_train_data_path.suffix == ".npy"
160-
), f"packed training data file must be a .npy file: {self.packed_train_data_path}"
161-
assert (
162-
self.packed_train_data_path.exists()
163-
), f"packed training data file does not exist: {self.packed_train_data_path}"
159+
assert self.packed_train_data_path.suffix == ".npy", (
160+
f"packed training data file must be a .npy file: {self.packed_train_data_path}"
161+
)
162+
assert self.packed_train_data_path.exists(), (
163+
f"packed training data file does not exist: {self.packed_train_data_path}"
164+
)
164165

165166
if self.packed_val_data_path is not None:
166167
self.packed_val_data_path = Path(self.packed_val_data_path)
167-
assert (
168-
self.packed_val_data_path.suffix == ".npy"
169-
), f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
170-
assert (
171-
self.packed_val_data_path.exists()
172-
), f"packed validation data file does not exist: {self.packed_val_data_path}"
168+
assert self.packed_val_data_path.suffix == ".npy", (
169+
f"packed validation data file must be a .npy file: {self.packed_val_data_path}"
170+
)
171+
assert self.packed_val_data_path.exists(), (
172+
f"packed validation data file does not exist: {self.packed_val_data_path}"
173+
)

nemo_lm/data/datasets/packing_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import numpy as np
2020
from tqdm import tqdm
2121

22+
2223
PACKING_ALGOS = ["first_fit_decreasing", "first_fit_shuffle"]
2324

2425
logger = logging.getLogger(__name__)
@@ -178,7 +179,7 @@ def create_packing_strategy(
178179

179180
logger.debug("Packed sequence lengths:")
180181
logger.debug(packed_seq_lens)
181-
logger.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient")
182+
logger.info(f"Packing is {sum(packed_seq_lens) / len(packed_seq_lens) / pack_size * 100:.2f}% efficient")
182183
logger.info(
183184
f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<"
184185
)

0 commit comments

Comments
 (0)