Skip to content

Commit 02816be

Browse files
authored
ci: Re-organize curator dep (#1004)
* Re-organize curator dep Signed-off-by: Dong Hyuk Chang <[email protected]> * Fix syntax error Signed-off-by: Dong Hyuk Chang <[email protected]> * Update uv lock file Signed-off-by: Dong Hyuk Chang <[email protected]> * Update grouping based on feedback Signed-off-by: Dong Hyuk Chang <[email protected]> * Update based on feedback Signed-off-by: Dong Hyuk Chang <[email protected]> * Update pyproject and uv lock Signed-off-by: Dong Hyuk Chang <[email protected]> * Update dockerfile Signed-off-by: Dong Hyuk Chang <[email protected]> * Fix syntax error and add back install build first Signed-off-by: Dong Hyuk Chang <[email protected]> --------- Signed-off-by: Dong Hyuk Chang <[email protected]>
1 parent d413461 commit 02816be

File tree

4 files changed

+267
-103
lines changed

4 files changed

+267
-103
lines changed

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ jobs:
151151
python-version: ${{ matrix.python-version }}
152152
- name: Run tests (CPU)
153153
run: |
154-
uv sync --link-mode copy --locked --extra text --extra audio --extra video --extra internvideo2 --group test
154+
uv sync --link-mode copy --locked --extra audio_cpu --extra text_cpu --extra video_cpu --group test
155155
uv run coverage run --branch --source=nemo_curator -m pytest -v tests -m "not gpu"
156156
157157
- name: Generate report

docker/Dockerfile

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,7 @@ FROM nemo_curator_dep AS nemo_curator
6767
COPY . /opt/Curator
6868
RUN cd /opt/Curator && \
6969
uv sync --locked --only-group build && \
70-
if [ $CURATOR_ENVIRONMENT = "dev" ]; then \
71-
uv sync --link-mode copy --locked --extra all --extra video_nvenc --all-groups; \
72-
else \
73-
uv sync --link-mode copy --locked --extra all --all-groups; \
74-
fi;
70+
uv sync --link-mode copy --locked --extra all --all-groups
7571

7672
COPY <<EOF /opt/venv/env.sh
7773
export UV_PROJECT_ENVIRONMENT=/opt/venv

pyproject.toml

Lines changed: 61 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,38 @@ dependencies = [
5656
]
5757

5858
[project.optional-dependencies]
59-
text = [
59+
cuda12 = ["gpustat", "pynvml"]
60+
61+
# Installs CPU + GPU text curation modules
62+
deduplication_cuda12 = [
63+
"cudf-cu12==25.6.*",
64+
"cugraph-cu12==25.6.*",
65+
"cuml-cu12==25.6.*",
66+
"nx-cugraph-cu12==25.6.*",
67+
"pylibraft-cu12==25.6.*",
68+
"raft-dask-cu12==25.6.*",
69+
"rapidsmpf-cu12==25.6.*",
70+
]
71+
72+
audio_cpu = [
73+
"nemo_toolkit[asr]",
74+
]
75+
audio_cuda12 = [
76+
"nemo_curator[audio_cpu]",
77+
"nemo_curator[cuda12]",
78+
]
79+
80+
image_cpu = []
81+
82+
# NVIDIA DALI (simplified; update the package to match your CUDA version if needed)
83+
image_cuda12 = [
84+
"nemo_curator[cuda12]",
85+
"nemo_curator[deduplication_cuda12]",
86+
"nvidia-dali-cuda120",
87+
]
88+
89+
# Text Curation Dependencies
90+
text_cpu = [
6091
# Download / Extract
6192
"beautifulsoup4",
6293
"justext",
@@ -75,40 +106,13 @@ text = [
75106
"ftfy==6.1.1",
76107
]
77108

78-
# Installs CPU + GPU text curation modules
79-
deduplication_cuda12x = [
80-
"cudf-cu12==25.6.*",
81-
"cugraph-cu12==25.6.*",
82-
"cuml-cu12==25.6.*",
83-
"nx-cugraph-cu12==25.6.*",
84-
"pylibraft-cu12==25.6.*",
85-
"raft-dask-cu12==25.6.*",
86-
"rapidsmpf-cu12==25.6.*",
87-
]
88-
89-
audio = [
90-
"nemo_toolkit[asr]",
91-
]
92-
93-
video = [
94-
"av==13.1.0",
95-
"opencv-python",
96-
"torchvision",
97-
"einops",
98-
"easydict",
99-
]
100-
101-
video_cuda = [
102-
"cvcuda_cu12",
103-
"pycuda",
104-
"vllm==0.9.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
105-
"flash-attn<=2.8.3; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
106-
"torch<=2.8.0",
107-
"torchaudio",
109+
text_cuda12 = [
110+
"nemo_curator[cuda12]",
111+
"nemo_curator[deduplication_cuda12]",
112+
"nemo_curator[text_cpu]",
108113
]
109114

110-
video_nvenc = ["PyNvVideoCodec==2.0.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
111-
115+
# Video Curation Dependencies
112116
internvideo2 = [
113117
"av>=13.0.0",
114118
"deepspeed>=0.15.1",
@@ -136,19 +140,33 @@ internvideo2 = [
136140
"wandb>=0.18.3",
137141
]
138142

139-
# NVIDIA DALI (simplified; update the package to match your CUDA version if needed)
140-
dali = [
141-
"nvidia-dali-cuda120",
143+
video_cpu = [
144+
"nemo_curator[internvideo2]",
145+
"av==13.1.0",
146+
"opencv-python",
147+
"torchvision",
148+
"einops",
149+
"easydict",
150+
]
151+
152+
video_cuda12 = [
153+
"nemo_curator[video_cpu]",
154+
"nemo_curator[cuda12]",
155+
"cvcuda_cu12",
156+
"flash-attn<=2.8.3; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
157+
"pycuda",
158+
"PyNvVideoCodec==2.0.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
159+
"torch<=2.8.0",
160+
"torchaudio",
161+
"vllm==0.9.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
142162
]
143163

164+
# All dependencies
144165
all = [
145-
"nemo_curator[dali]",
146-
"nemo_curator[deduplication_cuda12x]",
147-
"nemo_curator[text]",
148-
"nemo_curator[audio]",
149-
"nemo_curator[internvideo2]",
150-
"nemo_curator[video]",
151-
"nemo_curator[video_cuda]",
166+
"nemo_curator[audio_cuda12]",
167+
"nemo_curator[image_cuda12]",
168+
"nemo_curator[text_cuda12]",
169+
"nemo_curator[video_cuda12]",
152170
]
153171

154172
[dependency-groups]

0 commit comments

Comments
 (0)