Skip to content

Commit c29dcb6

Browse files
authored
Merge branch 'main' into batched-exact-dedup
2 parents f94d85a + b96a0d9 commit c29dcb6

File tree

97 files changed

+2172
-1038
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

97 files changed

+2172
-1038
lines changed

.github/actions/test-template/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ inputs:
4848
description: "Has Azure credentials"
4949
required: false
5050
default: "false"
51+
HF_TOKEN:
52+
description: "Hugging Face Token"
53+
required: true
5154
PAT:
5255
description: "GitHub Personal Access Token"
5356
required: true
@@ -96,6 +99,8 @@ runs:
9699
97100
- name: Start container
98101
shell: bash
102+
env:
103+
HF_TOKEN: ${{ inputs.HF_TOKEN }}
99104
run: |
100105
MNT_PATH=${{ steps.azure-fileshare.outputs.mnt_path }}
101106
@@ -112,6 +117,7 @@ runs:
112117
-d \
113118
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
114119
--shm-size=64g \
120+
--env HF_TOKEN=${HF_TOKEN} \
115121
--env RUN_ID=${{ github.run_id }} \
116122
--volume $(pwd)/NeMo-Curator:/workspace \
117123
--workdir /workspace \

.github/copy-pr-bot.yaml

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,3 @@
11
enabled: true
2-
additional_trustees:
3-
- ericharper
4-
- ko3n1g
5-
- chtruong814
6-
- thomasdhc
7-
- pablo-garay
8-
- ayushdg
9-
- praateekmahajan
10-
- sarahyurick
11-
- singhva
12-
- VibhuJawa
13-
- arhamm1
14-
- suiyoubi
15-
- abhinavg4
16-
- huvunvidia
17-
additional_vetters:
18-
- ericharper
19-
- ko3n1g
20-
- chtruong814
21-
- thomasdhc
22-
- pablo-garay
23-
- ayushdg
24-
- praateekmahajan
25-
- sarahyurick
26-
- singhva
27-
- VibhuJawa
28-
- arhamm1
29-
- suiyoubi
30-
- abhinavg4
31-
- huvunvidia
322
auto_sync_draft: false
333
auto_sync_ready: true

.github/workflows/cicd-main.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ jobs:
6666
needs: [pre-flight, cicd-wait-in-queue]
6767
runs-on: ${{ matrix.os }}
6868
name: Unit_Test_${{ matrix.folder}}_CPU_python-${{ matrix.python-version }}
69+
env:
70+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
6971
environment: nemo-ci
7072
if: |
7173
(
@@ -148,6 +150,7 @@ jobs:
148150
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
149151
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
150152
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
153+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
151154
PAT: ${{ secrets.PAT }}
152155
timeout: 20
153156
test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}

.github/workflows/config/.secrets.baseline

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -180,13 +180,6 @@
180180
"is_verified": false,
181181
"line_number": 33
182182
},
183-
{
184-
"type": "Secret Keyword",
185-
"filename": "docs/curate-text/synthetic/llm-client.md",
186-
"hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
187-
"is_verified": false,
188-
"line_number": 44
189-
},
190183
{
191184
"type": "Secret Keyword",
192185
"filename": "docs/curate-text/synthetic/llm-client.md",
@@ -202,15 +195,6 @@
202195
"line_number": 165
203196
}
204197
],
205-
"docs/curate-text/synthetic/multilingual-qa.md": [
206-
{
207-
"type": "Secret Keyword",
208-
"filename": "docs/curate-text/synthetic/multilingual-qa.md",
209-
"hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
210-
"is_verified": false,
211-
"line_number": 30
212-
}
213-
],
214198
"tests/models/client/test_openai_client.py": [
215199
{
216200
"type": "Secret Keyword",
@@ -248,5 +232,5 @@
248232
}
249233
]
250234
},
251-
"generated_at": "2026-02-11T21:26:53Z"
235+
"generated_at": "2026-02-26T00:35:18Z"
252236
}

CHANGELOG.md

Lines changed: 352 additions & 0 deletions
Large diffs are not rendered by default.

benchmarking/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ def run_entry(
195195
# Execute command with timeout
196196
logger.info(f"\tRunning command {' '.join(cmd) if isinstance(cmd, list) else cmd}")
197197
started_exec = time.time()
198+
ray_cluster_data = get_ray_cluster_data()
198199
run_data = run_command_with_timeout(
199200
command=cmd,
200201
timeout=entry.timeout_s,
@@ -216,7 +217,6 @@ def run_entry(
216217
"logs_dir": logs_path,
217218
}
218219
)
219-
ray_cluster_data = get_ray_cluster_data()
220220
# script_persisted_data is a dictionary with keys "params" and "metrics"
221221
# "params" will contain everything the script wrote to its params.json file
222222
# "metrics" will contain everything the script wrote to its metrics.json file plus metrics

benchmarking/runner/env_capture.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import pynvml
2626
from loguru import logger
2727
from runner.session import Session
28-
from runner.utils import get_obj_for_json, get_total_memory_bytes, run_shm_size_check
28+
from runner.utils import get_obj_for_json, get_shm_usage, get_total_memory_bytes
2929

3030

3131
def dump_env(session_obj: Session, output_path: Path) -> dict[str, Any]:
@@ -66,7 +66,8 @@ def get_env() -> dict[str, Any]:
6666

6767
git_commit_string = get_git_commit_string()
6868
cuda_visible_devices = get_gpu_info_string()
69-
shm_size_bytes, _ = run_shm_size_check(human_readable=False)
69+
shm = get_shm_usage()
70+
shm_size_bytes = shm.get("total_bytes")
7071

7172
# The image digest is not known at image build time and is not available inside the
7273
# container, so it must be passed in when the container is run.

benchmarking/runner/ray_cluster.py

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
import ray
2424
from loguru import logger
25-
from runner.utils import run_shm_size_check
25+
from runner.utils import get_shm_usage
2626

2727
from nemo_curator.core.client import RayClient
2828
from nemo_curator.core.utils import check_ray_responsive
@@ -31,6 +31,19 @@
3131
ray_client_start_poll_interval_s = 0.5
3232

3333

34+
_RAY_CLEANUP_WAIT_S = 10
35+
36+
37+
def _wait_for_ray_cleanup() -> None:
38+
"""Wait for Ray child processes to exit and /dev/shm segments to release after stopping a cluster."""
39+
logger.info(f"Waiting {_RAY_CLEANUP_WAIT_S}s for Ray to clean up child processes and release /dev/shm...")
40+
time.sleep(_RAY_CLEANUP_WAIT_S)
41+
42+
shm = get_shm_usage()
43+
if shm["summary"]:
44+
logger.info(f"SHM usage after cleanup wait: {shm['summary']}")
45+
46+
3447
def setup_ray_cluster_and_env( # noqa: PLR0913
3548
num_cpus: int,
3649
num_gpus: int,
@@ -52,9 +65,14 @@ def setup_ray_cluster_and_env( # noqa: PLR0913
5265
if ray_address_env:
5366
logger.warning(f"RAY_ADDRESS already set in environment: {ray_address_env}")
5467

68+
shm = get_shm_usage()
69+
if shm["summary"]:
70+
logger.info(f"SHM usage before Ray cluster setup: {shm['summary']}")
71+
5572
responsive = False
5673
retries = 0
5774
max_retries = 5
75+
client = None
5876
while not responsive and retries < max_retries:
5977
logger.info(f"Starting Ray cluster (attempt {retries + 1} of {max_retries})...")
6078

@@ -73,14 +91,23 @@ def setup_ray_cluster_and_env( # noqa: PLR0913
7391
ray_stdouterr_capture_file=ray_stdouterr_capture_file,
7492
object_store_memory=object_store_size,
7593
)
76-
client.start()
7794

78-
_ensure_ray_client_process_started(client, ray_client_start_timeout_s, ray_client_start_poll_interval_s)
79-
responsive = check_ray_responsive()
80-
run_shm_size_check(human_readable=True)
95+
try:
96+
client.start()
97+
_ensure_ray_client_process_started(client, ray_client_start_timeout_s, ray_client_start_poll_interval_s)
98+
responsive = True
99+
except Exception:
100+
logger.exception(f"Ray cluster start failed on attempt {retries + 1}")
101+
responsive = False
102+
81103
if not responsive:
82-
logger.info("Ray cluster did not become responsive in time, stopping client and retrying...")
83-
client.stop()
104+
logger.info("Ray cluster did not become responsive, cleaning up before retry...")
105+
try:
106+
client.stop()
107+
except Exception:
108+
logger.exception("Failed to stop client during retry cleanup")
109+
os.environ.pop("RAY_ADDRESS", None)
110+
_wait_for_ray_cleanup()
84111
retries += 1
85112

86113
if not responsive:
@@ -105,6 +132,10 @@ def teardown_ray_cluster_and_env(
105132
ray_client.stop()
106133
except Exception:
107134
logger.exception("Failed to stop Ray client")
135+
136+
# Wait for Ray child processes to exit and /dev/shm to release
137+
_wait_for_ray_cleanup()
138+
108139
# Copy debugging artifacts and clean up temp directory
109140
try:
110141
_copy_ray_debug_artifacts(ray_temp_path, ray_cluster_path)
@@ -114,12 +145,18 @@ def teardown_ray_cluster_and_env(
114145

115146

116147
def get_ray_cluster_data() -> dict[str, Any]:
117-
"""Get resource data from the Ray cluster."""
118-
ray.init(ignore_reinit_error=True)
119-
time.sleep(0.2) # ray.available_resources() returns might have a lag
120-
ray_data = ray.cluster_resources()
121-
ray.shutdown()
122-
return ray_data
148+
"""Get resource data from the Ray cluster.
149+
150+
If the cluster is not responsive (e.g. crashed due to OOM), returns an empty dict
151+
instead of connecting — ray.init() on a dead cluster fatally terminates the process
152+
via Ray's C++ core worker.
153+
"""
154+
if not check_ray_responsive():
155+
logger.warning("Ray cluster is not responsive, skipping cluster data collection")
156+
return {}
157+
with ray.init(ignore_reinit_error=True):
158+
time.sleep(0.2) # ray.available_resources() returns might have a lag
159+
return ray.cluster_resources()
123160

124161

125162
def _ensure_ray_client_process_started(client: RayClient, timeout_s: int, poll_interval_s: float) -> None:

benchmarking/runner/utils.py

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import os
1616
import re
17-
import subprocess
17+
import shutil
1818
from pathlib import Path
1919
from typing import Any
2020

@@ -138,39 +138,35 @@ def read_int_from_file(path: str) -> int | None:
138138
return os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
139139

140140

141-
def run_shm_size_check(human_readable: bool = False) -> tuple[int | None, str | None]:
141+
def get_shm_usage() -> dict[str, int | str | None]:
142142
"""
143-
Run the appropriate "df" command to check the size of the system shared memory space.
143+
Get structured /dev/shm usage data using shutil.disk_usage.
144+
145+
Returns a dict with keys:
146+
total_bytes, used_bytes, available_bytes: int or None
147+
summary: human-readable string summarizing usage
144148
"""
145-
command = ["df", "-h", "/dev/shm"] if human_readable else ["df", "--block-size=1", "/dev/shm"] # noqa: S108
146-
command_str = " ".join(command)
147-
result = None
149+
result_dict: dict[str, int | str | None] = {
150+
"total_bytes": None,
151+
"used_bytes": None,
152+
"available_bytes": None,
153+
"summary": None,
154+
}
148155
try:
149-
result = subprocess.run( # noqa: S603
150-
command,
151-
check=True,
152-
capture_output=True,
153-
text=True,
154-
)
155-
logger.debug(f"`{command_str}` output:\n{result.stdout}")
156-
except subprocess.CalledProcessError as df_exc:
157-
logger.warning(f"Could not run `{command_str}`: {df_exc}")
158-
159-
# Extract the size from the last line of the output
160-
if result is not None:
161-
output = result.stdout
162-
line = output.strip().split("\n")[-1]
163-
try:
164-
size = line.split()[1] # Size is the second column
165-
# Convert to a real number if not meant for simply reading by humans
166-
if not human_readable:
167-
size = int(size)
168-
except (ValueError, IndexError):
169-
logger.warning(f"Could not parse size from `{command_str}` output line: {line}")
170-
size = None
171-
return (size, output)
172-
else:
173-
return (None, None)
156+
usage = shutil.disk_usage("/dev/shm") # noqa: S108
157+
except OSError as exc:
158+
logger.warning(f"Could not get /dev/shm usage: {exc}")
159+
return result_dict
160+
161+
result_dict["total_bytes"] = usage.total
162+
result_dict["used_bytes"] = usage.used
163+
result_dict["available_bytes"] = usage.free
164+
result_dict["summary"] = (
165+
f"/dev/shm: {human_readable_bytes_repr(usage.used)} used / " # noqa: S108
166+
f"{human_readable_bytes_repr(usage.total)} total "
167+
f"({human_readable_bytes_repr(usage.free)} available)"
168+
)
169+
return result_dict
174170

175171

176172
def human_readable_bytes_repr(size: int) -> str:

benchmarking/scripts/arxiv_e2e_pipeline_benchmark.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,16 @@
4949
from nemo_curator.stages.text.download.base import URLGenerator
5050
from nemo_curator.stages.text.download.base.iterator import DocumentIterateExtractStage
5151
from nemo_curator.stages.text.download.base.url_generation import URLGenerationStage
52-
from nemo_curator.stages.text.filters import (
53-
FastTextLangId,
52+
from nemo_curator.stages.text.filters import ScoreFilter
53+
from nemo_curator.stages.text.filters.fasttext import FastTextLangId
54+
from nemo_curator.stages.text.filters.heuristic import (
5455
PunctuationFilter,
55-
RepeatedLinesFilter,
56-
RepeatingTopNGramsFilter,
5756
UrlsFilter,
5857
WordCountFilter,
5958
)
59+
from nemo_curator.stages.text.filters.heuristic.repetition import RepeatedLinesFilter, RepeatingTopNGramsFilter
6060
from nemo_curator.stages.text.io.writer import JsonlWriter, ParquetWriter
6161
from nemo_curator.stages.text.modules.add_id import AddId
62-
from nemo_curator.stages.text.modules.score_filter import ScoreFilter
6362
from nemo_curator.tasks import DocumentBatch, _EmptyTask
6463
from nemo_curator.tasks.utils import TaskPerfUtils
6564

0 commit comments

Comments
 (0)