8383
8484from flax import nnx
8585
86- from MaxText import globals
86+ from MaxText . globals import MAXTEXT_REPO_ROOT
8787from MaxText import max_logging
8888from MaxText import max_utils
8989from MaxText import pyconfig
122122)
123123# Regex to extract the final numerical answer
124124MATCH_ANSWER = re .compile (rf"{ ANSWER_START } .*?([\d\.\,\$]{{1,}})" , flags = re .MULTILINE | re .DOTALL )
125- CHAT_TEMPLATE_PATH = f" { globals . MAXTEXT_REPO_ROOT } / src/ MaxText/ examples/ chat_templates/ math_qa.json"
125+ CHAT_TEMPLATE_PATH = os . path . join ( MAXTEXT_REPO_ROOT , " src" , " MaxText" , " examples" , " chat_templates" , " math_qa.json")
126126
127127
128128def get_test_dataset (config , tokenizer ):
129+ """Loads and prepares the test dataset from Hugging Face.
130+
131+ Args:
132+ config: The pyconfig object containing run configurations, including
133+ `hf_access_token`.
134+ tokenizer: The tokenizer for processing the text data.
135+
136+ Returns:
137+ A grain.MapDataset instance for the test split, with prompts and target
138+ answers.
139+ """
129140 template_config = instruction_data_processing .load_template_from_file (CHAT_TEMPLATE_PATH )
130141 dataset = datasets .load_dataset (
131142 DATASET_NAME ,
@@ -159,7 +170,17 @@ def get_test_dataset(config, tokenizer):
159170
160171
161172def evaluate_model (dataset , vllm_rollout , debug = True ):
162- """Runs evaluation on the model using vLLM."""
173+ """Runs evaluation on the model using vLLM.
174+
175+ Args:
176+ dataset: The dataset to evaluate on.
177+ vllm_rollout: The vLLM rollout object for generating responses.
178+ debug: If True, prints debug information for each sample.
179+
180+ Returns:
181+ A dictionary containing evaluation scores: 'correct', 'partially_correct',
182+ and 'correct_format' percentages.
183+ """
163184 rollout_config = base_rollout .RolloutConfig (
164185 max_tokens_to_generate = MAX_TOKENS_TO_GENERATE ,
165186 max_prompt_length = MAX_PROMPT_LENGTH ,
@@ -201,12 +222,35 @@ def evaluate_model(dataset, vllm_rollout, debug=True):
201222
202223
def safe_string_to_float(text):
  """Strips formatting characters so the string can be parsed as a float.

  Drops commas, spaces, and dollar signs, e.g. "$2,125" -> "2125" and
  "$50" -> "50".

  Args:
    text: The input string.

  Returns:
    The cleaned string.
  """
  # Remove all three characters in a single C-level pass.
  return text.translate(str.maketrans("", "", ", $"))
207238
208239
209240def score_response (target , prediction , debug = True ):
241+ """Scores the model's prediction against the target answer.
242+
243+ It checks for exact correctness, partial correctness (within 10%), and
244+ whether the response follows the expected format.
245+
246+ Args:
247+ target: The ground truth answer string.
248+ prediction: The model's generated response string.
249+ debug: If True, prints exceptions during scoring.
250+
251+ Returns:
252+ A tuple of booleans: (is_correct, is_partially_correct, has_correct_format).
253+ """
210254 is_correct , is_partially_correct , has_correct_format = False , False , False
211255 extracted_response = guess .group (1 ) if (guess := MATCH_ANSWER .search (prediction )) is not None else ""
212256 extracted_response = safe_string_to_float (extracted_response )
@@ -231,6 +275,17 @@ def score_response(target, prediction, debug=True):
231275
232276
233277def create_vllm_rollout (config , model , mesh , tokenizer ):
278+ """Creates a vLLM rollout engine for text generation.
279+
280+ Args:
281+ config: The pyconfig object containing run configurations.
282+ model: The NNX model graph.
283+ mesh: The JAX device mesh.
284+ tokenizer: The tokenizer.
285+
286+ Returns:
287+ A VllmRollout instance configured for the model and hardware.
288+ """
234289 tunix_model = TunixMaxTextAdapter (model )
235290 return VllmRollout (
236291 model = tunix_model ,
@@ -245,6 +300,14 @@ def create_vllm_rollout(config, model, mesh, tokenizer):
245300
246301
247302def get_tokenizer (config ):
303+ """Initializes and returns the tokenizer.
304+
305+ Args:
306+ config: The pyconfig object with `tokenizer_path` and `hf_access_token`.
307+
308+ Returns:
309+ A Hugging Face tokenizer instance.
310+ """
248311 tokenizer = transformers .AutoTokenizer .from_pretrained (
249312 config .tokenizer_path ,
250313 token = config .hf_access_token ,
@@ -253,6 +316,11 @@ def get_tokenizer(config):
253316
254317
255318def train_and_evaluate (config ):
319+ """Orchestrates the pre-train evaluation, SFT, and post-train evaluation.
320+
321+ Args:
322+ config: The pyconfig object containing all run configurations.
323+ """
256324 tokenizer = get_tokenizer (config )
257325 test_dataset = get_test_dataset (config , tokenizer )
258326 test_dataset = test_dataset [:NUM_TEST_SAMPLES ]
@@ -261,16 +329,16 @@ def train_and_evaluate(config):
261329 vllm_rollout = create_vllm_rollout (config , trainer .model , mesh , tokenizer )
262330
263331 # 1. Pre-SFT Evaluation
264- max_logging .log (f "Running Pre-SFT evaluation..." )
332+ max_logging .log ("Running Pre-SFT evaluation..." )
265333 score = evaluate_model (test_dataset , vllm_rollout )
266334 print ("Score for PRE-SFT EVALUATION: " , score )
267335
268336 # 2. SFT Training
269- max_logging .log (f "Starting SFT training..." )
337+ max_logging .log ("Starting SFT training..." )
270338 trainer = sft_trainer .train_model (config , trainer , mesh )
271339
272340 # 3. Post-SFT Evaluation
273- max_logging .log (f "Running Post-SFT evaluation..." )
341+ max_logging .log ("Running Post-SFT evaluation..." )
274342 tunix_model = TunixMaxTextAdapter (trainer .model )
275343 state = nnx .state (tunix_model )
276344 vllm_rollout .update_params (state )
0 commit comments