remove more internvid content

lbliii · lbliii · commit e5ca35d38004 · 2026-02-11T12:51:57.000-05:00
Signed-off-by: Lawrence Lane <llane@nvidia.com>
diff --git a/docs/curate-video/process-data/dedup.md b/docs/curate-video/process-data/dedup.md
@@ -15,7 +15,7 @@ modality: "video-only"
 Use clip-level embeddings to identify near-duplicate video clips so your dataset remains compact, diverse, and efficient to train on.
 
 ## Before You Start
-- Make sure you have embeddings which are written by the [`ClipWriterStage`](video-save-export) under `iv2_embd_parquet/` or `ce1_embd_parquet/`. For a runnable workflow, refer to the [Split and Remove Duplicates Workflow](video-tutorials-split-dedup). The embeddings must be in parquet files containing the columns `id` and `embedding`.
+- Make sure you have embeddings which are written by the [`ClipWriterStage`](video-save-export) under `ce1_embd_parquet/`. For a runnable workflow, refer to the [Split and Remove Duplicates Workflow](video-tutorials-split-dedup). The embeddings must be in parquet files containing the columns `id` and `embedding`.
 - Verify local paths or configure S3-compatible credentials. Provide `storage_options` in read/write keyword arguments when reading or writing cloud paths.
 
 
@@ -24,7 +24,7 @@ Use clip-level embeddings to identify near-duplicate video clips so your dataset
 Duplicate identification operates on clip-level embeddings produced during processing:
 
 1. **Inputs**
-   - Parquet batches from `ClipWriterStage` under `iv2_embd_parquet/` or `ce1_embd_parquet/`
+   - Parquet batches from `ClipWriterStage` under `ce1_embd_parquet/`
    - Columns: `id`, `embedding`
 
 2. **Outputs**
@@ -50,7 +50,7 @@ from nemo_curator.stages.deduplication.semantic.ranking import RankingStrategy
 from nemo_curator.backends.xenna import XennaExecutor
 
 workflow = SemanticDeduplicationWorkflow(
-    input_path="/path/to/embeddings/",  # e.g., iv2_embd_parquet/ or ce1_embd_parquet/
+    input_path="/path/to/embeddings/",  # e.g., ce1_embd_parquet/
     output_path="/path/to/duplicates/",
     cache_path="/path/to/cache/",  # Optional: defaults to output_path
     n_clusters=1000,
diff --git a/docs/curate-video/process-data/index.md b/docs/curate-video/process-data/index.md
@@ -124,7 +124,7 @@ pipeline.add_stage(
 )
 ```
 
-Path helpers are available to resolve common locations (such as `clips/`, `filtered_clips/`, `previews/`, `metas/v0/`, and `iv2_embd_parquet/`).
+Path helpers are available to resolve common locations (such as `clips/`, `filtered_clips/`, `previews/`, `metas/v0/`, and `ce1_embd_parquet/`).
 
 ```{toctree}
 :maxdepth: 2
diff --git a/docs/curate-video/save-export.md b/docs/curate-video/save-export.md
@@ -95,8 +95,8 @@ The writer produces these directories under `output_path`:
 - `filtered_clips/`: Media for filtered-out clips.
 - `previews/`: Preview images (`.webp`).
 - `metas/v0/`: Per-clip metadata (`.json`).
-- `iv2_embd/`, `ce1_embd/`: Per-clip embeddings (`.pickle`).
-- `iv2_embd_parquet/`, `ce1_embd_parquet/`: Parquet batches with columns `id` and `embedding`.
+- `ce1_embd/`: Per-clip embeddings (`.pickle`).
+- `ce1_embd_parquet/`: Parquet batches with columns `id` and `embedding`.
 - `processed_videos/`, `processed_clip_chunks/`: Video-level metadata and per-chunk statistics.
 
 ### Per-Clip Metadata
@@ -132,8 +132,8 @@ Each clip writes a JSON file under `metas/v0/` with clip- and window-level field
 
 ### Embeddings and Parquet outputs
 
-- When embeddings exist, the stage writes per-clip `.pickle` files under `iv2_embd/` or `ce1_embd/`.
-- The stage also batches embeddings per clip chunk into Parquet files under `iv2_embd_parquet/` or `ce1_embd_parquet/` with columns `id` and `embedding` and writes those files to disk.
+- When embeddings exist, the stage writes per-clip `.pickle` files under `ce1_embd/`.
+- The stage also batches embeddings per clip chunk into Parquet files under `ce1_embd_parquet/` with columns `id` and `embedding` and writes those files to disk.
 
 ## Helpers
 
@@ -150,7 +150,6 @@ clips_dir = ClipWriterStage.get_output_path_clips(OUT)
 filtered_clips_dir = ClipWriterStage.get_output_path_clips(OUT, filtered=True)
 previews_dir = ClipWriterStage.get_output_path_previews(OUT)
 metas_dir = ClipWriterStage.get_output_path_metas(OUT, "v0")
-iv2_parquet_dir = ClipWriterStage.get_output_path_iv2_embd_parquet(OUT)
 ce1_parquet_dir = ClipWriterStage.get_output_path_ce1_embd_parquet(OUT)
 processed_videos_dir = ClipWriterStage.get_output_path_processed_videos(OUT)
 processed_chunks_dir = ClipWriterStage.get_output_path_processed_clip_chunks(OUT)
diff --git a/docs/curate-video/tutorials/split-dedup.md b/docs/curate-video/tutorials/split-dedup.md
@@ -49,7 +49,7 @@ Writer-related flags you can add:
   --dry-run                   # Write nothing; validate only
 ```
 
-The pipeline writes embeddings under `$OUT_DIR/iv2_embd_parquet/` (or `ce1_embd_parquet/` if you use Cosmos-Embed1).
+The pipeline writes embeddings under `$OUT_DIR/ce1_embd_parquet/` when using Cosmos-Embed1.
 
 ### Embedding Format Example
 
@@ -64,7 +64,7 @@ The pipeline writes embeddings to Parquet with two columns:
 
 ```text
 $OUT_DIR/
-  iv2_embd_parquet/
+  ce1_embd_parquet/
     1a2b3c4d-....parquet
     5e6f7g8h-....parquet
 ```
@@ -93,7 +93,7 @@ embedding: list<float32>  # length = 768 for Cosmos-Embed1
 ```python
 import pyarrow.parquet as pq
 
-table = pq.read_table(f"{OUT_DIR}/iv2_embd_parquet")
+table = pq.read_table(f"{OUT_DIR}/ce1_embd_parquet")
 df = table.to_pandas()
 print(df.head())  # columns: id, embedding (list[float])
 ```
@@ -113,7 +113,7 @@ from nemo_curator.pipeline import Pipeline
 from nemo_curator.stages.deduplication.semantic.kmeans import KMeansStage
 from nemo_curator.stages.deduplication.semantic.pairwise import PairwiseStage
 
-INPUT_PARQUET = f"{OUT_DIR}/iv2_embd_parquet"  # or s3://...
+INPUT_PARQUET = f"{OUT_DIR}/ce1_embd_parquet"  # or s3://...
 OUTPUT_DIR = f"{OUT_DIR}/semantic_dedup"
 
 pipe = Pipeline(name="video_semantic_dedup", description="K-means + pairwise duplicate removal")
@@ -175,7 +175,7 @@ Video-specific pointers:
 - Use `ClipWriterStage` path helpers to locate outputs: `nemo_curator/stages/video/io/clip_writer.py`.
   - Processed videos: `get_output_path_processed_videos(OUT_DIR)`
   - Clip chunks and previews: `get_output_path_processed_clip_chunks(OUT_DIR)`, `get_output_path_previews(OUT_DIR)`
-  - Embeddings parquet: `${OUT_DIR}/iv2_embd_parquet` (or `${OUT_DIR}/ce1_embd_parquet`)
+  - Embeddings parquet: `${OUT_DIR}/ce1_embd_parquet`
 
 ### Example Export
 
@@ -188,7 +188,7 @@ from glob import glob
 
 OUT_DIR = os.environ["OUT_DIR"]
 clips_dir = os.path.join(OUT_DIR, "clips")  # adjust if filtering path used
-meta_parquet = os.path.join(OUT_DIR, "iv2_embd_parquet")
+meta_parquet = os.path.join(OUT_DIR, "ce1_embd_parquet")
 
 def iter_clips(path):
     for p in glob(os.path.join(path, "**", "*.mp4"), recursive=True):

Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,7 @@ pipeline.add_stage(`
`124`	`124`	`)`
`125`	`125`	```
`126`	`126`
`127`		-Path helpers are available to resolve common locations (such as `clips/`, `filtered_clips/`, `previews/`, `metas/v0/`, and `iv2_embd_parquet/`).
	`127`	+Path helpers are available to resolve common locations (such as `clips/`, `filtered_clips/`, `previews/`, `metas/v0/`, and `ce1_embd_parquet/`).
`128`	`128`
`129`	`129`	```{toctree}
`130`	`130`	`:maxdepth: 2`