NVIDIA-NeMo
diff --git a/‎.github/actions/test-template/action.yml‎
Lines changed: 6 additions & 0 deletions b/‎.github/actions/test-template/action.yml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.github/copy-pr-bot.yaml‎
Lines changed: 0 additions & 30 deletions b/‎.github/copy-pr-bot.yaml‎
Lines changed: 0 additions & 30 deletions
diff --git a/‎.github/workflows/cicd-main.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/cicd-main.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/config/.secrets.baseline‎
Lines changed: 1 addition & 17 deletions b/‎.github/workflows/config/.secrets.baseline‎
Lines changed: 1 addition & 17 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 352 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 352 additions & 0 deletions
diff --git a/‎benchmarking/run.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarking/run.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarking/runner/env_capture.py‎
Lines changed: 3 additions & 2 deletions b/‎benchmarking/runner/env_capture.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎benchmarking/runner/ray_cluster.py‎
Lines changed: 50 additions & 13 deletions b/‎benchmarking/runner/ray_cluster.py‎
Lines changed: 50 additions & 13 deletions
diff --git a/‎benchmarking/runner/utils.py‎
Lines changed: 27 additions & 31 deletions b/‎benchmarking/runner/utils.py‎
Lines changed: 27 additions & 31 deletions
diff --git a/‎benchmarking/scripts/arxiv_e2e_pipeline_benchmark.py‎
Lines changed: 4 additions & 5 deletions b/‎benchmarking/scripts/arxiv_e2e_pipeline_benchmark.py‎
Lines changed: 4 additions & 5 deletions
@@ -48,6 +48,9 @@ inputs:
     description: "Has Azure credentials"
     required: false
     default: "false"
+  HF_TOKEN:
+    description: "Hugging Face Token"
+    required: true
   PAT:
     description: "GitHub Personal Access Token"
     required: true
@@ -96,6 +99,8 @@ runs:
 
     - name: Start container
       shell: bash
+      env:
+        HF_TOKEN: ${{ inputs.HF_TOKEN }}
       run: |
         MNT_PATH=${{ steps.azure-fileshare.outputs.mnt_path }}
 
@@ -112,6 +117,7 @@ runs:
           -d \
           --name nemo_container_${{ github.run_id }} ${ARG[@]} \
           --shm-size=64g \
+          --env HF_TOKEN=${HF_TOKEN} \
           --env RUN_ID=${{ github.run_id }} \
           --volume $(pwd)/NeMo-Curator:/workspace \
           --workdir /workspace \
 
@@ -1,33 +1,3 @@
 enabled: true
-additional_trustees:
-  - ericharper
-  - ko3n1g
-  - chtruong814
-  - thomasdhc
-  - pablo-garay
-  - ayushdg
-  - praateekmahajan
-  - sarahyurick
-  - singhva
-  - VibhuJawa
-  - arhamm1
-  - suiyoubi
-  - abhinavg4
-  - huvunvidia
-additional_vetters:
-  - ericharper
-  - ko3n1g
-  - chtruong814
-  - thomasdhc
-  - pablo-garay
-  - ayushdg
-  - praateekmahajan
-  - sarahyurick
-  - singhva
-  - VibhuJawa
-  - arhamm1
-  - suiyoubi
-  - abhinavg4
-  - huvunvidia
 auto_sync_draft: false
 auto_sync_ready: true
@@ -66,6 +66,8 @@ jobs:
     needs: [pre-flight, cicd-wait-in-queue]
     runs-on: ${{ matrix.os }}
     name: Unit_Test_${{ matrix.folder}}_CPU_python-${{ matrix.python-version }}
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     environment: nemo-ci
     if: |
       (
@@ -148,6 +150,7 @@ jobs:
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
           azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PAT: ${{ secrets.PAT }}
           timeout: 20
           test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
 
@@ -180,13 +180,6 @@
         "is_verified": false,
         "line_number": 33
       },
-      {
-        "type": "Secret Keyword",
-        "filename": "docs/curate-text/synthetic/llm-client.md",
-        "hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
-        "is_verified": false,
-        "line_number": 44
-      },
       {
         "type": "Secret Keyword",
         "filename": "docs/curate-text/synthetic/llm-client.md",
@@ -202,15 +195,6 @@
         "line_number": 165
       }
     ],
-    "docs/curate-text/synthetic/multilingual-qa.md": [
-      {
-        "type": "Secret Keyword",
-        "filename": "docs/curate-text/synthetic/multilingual-qa.md",
-        "hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
-        "is_verified": false,
-        "line_number": 30
-      }
-    ],
     "tests/models/client/test_openai_client.py": [
       {
         "type": "Secret Keyword",
@@ -248,5 +232,5 @@
       }
     ]
   },
-  "generated_at": "2026-02-11T21:26:53Z"
+  "generated_at": "2026-02-26T00:35:18Z"
 }
@@ -195,6 +195,7 @@ def run_entry(
         # Execute command with timeout
         logger.info(f"\tRunning command {' '.join(cmd) if isinstance(cmd, list) else cmd}")
         started_exec = time.time()
+        ray_cluster_data = get_ray_cluster_data()
         run_data = run_command_with_timeout(
             command=cmd,
             timeout=entry.timeout_s,
@@ -216,7 +217,6 @@ def run_entry(
                 "logs_dir": logs_path,
             }
         )
-        ray_cluster_data = get_ray_cluster_data()
         # script_persisted_data is a dictionary with keys "params" and "metrics"
         # "params" will contain everything the script wrote to its params.json file
         # "metrics" will contain everything the script wrote to its metrics.json file plus metrics
 
@@ -25,7 +25,7 @@
 import pynvml
 from loguru import logger
 from runner.session import Session
-from runner.utils import get_obj_for_json, get_total_memory_bytes, run_shm_size_check
+from runner.utils import get_obj_for_json, get_shm_usage, get_total_memory_bytes
 
 
 def dump_env(session_obj: Session, output_path: Path) -> dict[str, Any]:
@@ -66,7 +66,8 @@ def get_env() -> dict[str, Any]:
 
     git_commit_string = get_git_commit_string()
     cuda_visible_devices = get_gpu_info_string()
-    shm_size_bytes, _ = run_shm_size_check(human_readable=False)
+    shm = get_shm_usage()
+    shm_size_bytes = shm.get("total_bytes")
 
     # The image digest is not known at image build time and is not available inside the
     # container, so it must be passed in when the container is run.
 
@@ -22,7 +22,7 @@
 
 import ray
 from loguru import logger
-from runner.utils import run_shm_size_check
+from runner.utils import get_shm_usage
 
 from nemo_curator.core.client import RayClient
 from nemo_curator.core.utils import check_ray_responsive
@@ -31,6 +31,19 @@
 ray_client_start_poll_interval_s = 0.5
 
 
+_RAY_CLEANUP_WAIT_S = 10
+
+
+def _wait_for_ray_cleanup() -> None:
+    """Wait for Ray child processes to exit and /dev/shm segments to release after stopping a cluster."""
+    logger.info(f"Waiting {_RAY_CLEANUP_WAIT_S}s for Ray to clean up child processes and release /dev/shm...")
+    time.sleep(_RAY_CLEANUP_WAIT_S)
+
+    shm = get_shm_usage()
+    if shm["summary"]:
+        logger.info(f"SHM usage after cleanup wait: {shm['summary']}")
+
+
 def setup_ray_cluster_and_env(  # noqa: PLR0913
     num_cpus: int,
     num_gpus: int,
@@ -52,9 +65,14 @@ def setup_ray_cluster_and_env(  # noqa: PLR0913
     if ray_address_env:
         logger.warning(f"RAY_ADDRESS already set in environment: {ray_address_env}")
 
+    shm = get_shm_usage()
+    if shm["summary"]:
+        logger.info(f"SHM usage before Ray cluster setup: {shm['summary']}")
+
     responsive = False
     retries = 0
     max_retries = 5
+    client = None
     while not responsive and retries < max_retries:
         logger.info(f"Starting Ray cluster (attempt {retries + 1} of {max_retries})...")
 
@@ -73,14 +91,23 @@ def setup_ray_cluster_and_env(  # noqa: PLR0913
             ray_stdouterr_capture_file=ray_stdouterr_capture_file,
             object_store_memory=object_store_size,
         )
-        client.start()
 
-        _ensure_ray_client_process_started(client, ray_client_start_timeout_s, ray_client_start_poll_interval_s)
-        responsive = check_ray_responsive()
-        run_shm_size_check(human_readable=True)
+        try:
+            client.start()
+            _ensure_ray_client_process_started(client, ray_client_start_timeout_s, ray_client_start_poll_interval_s)
+            responsive = True
+        except Exception:
+            logger.exception(f"Ray cluster start failed on attempt {retries + 1}")
+            responsive = False
+
         if not responsive:
-            logger.info("Ray cluster did not become responsive in time, stopping client and retrying...")
-            client.stop()
+            logger.info("Ray cluster did not become responsive, cleaning up before retry...")
+            try:
+                client.stop()
+            except Exception:
+                logger.exception("Failed to stop client during retry cleanup")
+            os.environ.pop("RAY_ADDRESS", None)
+            _wait_for_ray_cleanup()
             retries += 1
 
     if not responsive:
@@ -105,6 +132,10 @@ def teardown_ray_cluster_and_env(
             ray_client.stop()
         except Exception:
             logger.exception("Failed to stop Ray client")
+
+        # Wait for Ray child processes to exit and /dev/shm to release
+        _wait_for_ray_cleanup()
+
         # Copy debugging artifacts and clean up temp directory
         try:
             _copy_ray_debug_artifacts(ray_temp_path, ray_cluster_path)
@@ -114,12 +145,18 @@ def teardown_ray_cluster_and_env(
 
 
 def get_ray_cluster_data() -> dict[str, Any]:
-    """Get resource data from the Ray cluster."""
-    ray.init(ignore_reinit_error=True)
-    time.sleep(0.2)  # ray.available_resources() returns might have a lag
-    ray_data = ray.cluster_resources()
-    ray.shutdown()
-    return ray_data
+    """Get resource data from the Ray cluster.
+
+    If the cluster is not responsive (e.g. crashed due to OOM), returns an empty dict
+    instead of connecting — ray.init() on a dead cluster fatally terminates the process
+    via Ray's C++ core worker.
+    """
+    if not check_ray_responsive():
+        logger.warning("Ray cluster is not responsive, skipping cluster data collection")
+        return {}
+    with ray.init(ignore_reinit_error=True):
+        time.sleep(0.2)  # the values ray.cluster_resources() returns might lag, so pause briefly first
+        return ray.cluster_resources()
 
 
 def _ensure_ray_client_process_started(client: RayClient, timeout_s: int, poll_interval_s: float) -> None:
 
@@ -14,7 +14,7 @@
 
 import os
 import re
-import subprocess
+import shutil
 from pathlib import Path
 from typing import Any
 
@@ -138,39 +138,35 @@ def read_int_from_file(path: str) -> int | None:
     return os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
 
 
-def run_shm_size_check(human_readable: bool = False) -> tuple[int | None, str | None]:
+def get_shm_usage() -> dict[str, int | str | None]:
     """
-    Run the appropriate "df" command to check the size of the system shared memory space.
+    Get structured /dev/shm usage data using shutil.disk_usage.
+
+    Returns a dict with keys:
+        total_bytes, used_bytes, available_bytes: int or None
+        summary: human-readable string summarizing usage
     """
-    command = ["df", "-h", "/dev/shm"] if human_readable else ["df", "--block-size=1", "/dev/shm"]  # noqa: S108
-    command_str = " ".join(command)
-    result = None
+    result_dict: dict[str, int | str | None] = {
+        "total_bytes": None,
+        "used_bytes": None,
+        "available_bytes": None,
+        "summary": None,
+    }
     try:
-        result = subprocess.run(  # noqa: S603
-            command,
-            check=True,
-            capture_output=True,
-            text=True,
-        )
-        logger.debug(f"`{command_str}` output:\n{result.stdout}")
-    except subprocess.CalledProcessError as df_exc:
-        logger.warning(f"Could not run `{command_str}`: {df_exc}")
-
-    # Extract the size from the last line of the output
-    if result is not None:
-        output = result.stdout
-        line = output.strip().split("\n")[-1]
-        try:
-            size = line.split()[1]  # Size is the second column
-            # Convert to a real number if not meant for simply reading by humans
-            if not human_readable:
-                size = int(size)
-        except (ValueError, IndexError):
-            logger.warning(f"Could not parse size from `{command_str}` output line: {line}")
-            size = None
-        return (size, output)
-    else:
-        return (None, None)
+        usage = shutil.disk_usage("/dev/shm")  # noqa: S108
+    except OSError as exc:
+        logger.warning(f"Could not get /dev/shm usage: {exc}")
+        return result_dict
+
+    result_dict["total_bytes"] = usage.total
+    result_dict["used_bytes"] = usage.used
+    result_dict["available_bytes"] = usage.free
+    result_dict["summary"] = (
+        f"/dev/shm: {human_readable_bytes_repr(usage.used)} used / "  # noqa: S108
+        f"{human_readable_bytes_repr(usage.total)} total "
+        f"({human_readable_bytes_repr(usage.free)} available)"
+    )
+    return result_dict
 
 
 def human_readable_bytes_repr(size: int) -> str:
 
@@ -49,17 +49,16 @@
 from nemo_curator.stages.text.download.base import URLGenerator
 from nemo_curator.stages.text.download.base.iterator import DocumentIterateExtractStage
 from nemo_curator.stages.text.download.base.url_generation import URLGenerationStage
-from nemo_curator.stages.text.filters import (
-    FastTextLangId,
+from nemo_curator.stages.text.filters import ScoreFilter
+from nemo_curator.stages.text.filters.fasttext import FastTextLangId
+from nemo_curator.stages.text.filters.heuristic import (
     PunctuationFilter,
-    RepeatedLinesFilter,
-    RepeatingTopNGramsFilter,
     UrlsFilter,
     WordCountFilter,
 )
+from nemo_curator.stages.text.filters.heuristic.repetition import RepeatedLinesFilter, RepeatingTopNGramsFilter
 from nemo_curator.stages.text.io.writer import JsonlWriter, ParquetWriter
 from nemo_curator.stages.text.modules.add_id import AddId
-from nemo_curator.stages.text.modules.score_filter import ScoreFilter
 from nemo_curator.tasks import DocumentBatch, _EmptyTask
 from nemo_curator.tasks.utils import TaskPerfUtils