
Commit 2331b9b

Adds FuzzyDedup identification and Removal benchmarks (#1233)

1 parent a1706fd · commit 2331b9b
16 files changed: +676 −284 lines

benchmarking/Dockerfile
Lines changed: 2 additions & 0 deletions

@@ -28,11 +28,13 @@ RUN apt-get update \
 
 # Add dependencies for benchmarking to the Curator Python environment
 RUN cd /opt/Curator \
+    && uv sync --extra all \
     && uv add \
         GitPython \
         oauth2client \
         pydrive2 \
         pynvml \
+        pyyaml \
         rich \
     && uv cache prune
 
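The new `uv sync --extra all` line installs Curator's own optional extras before the benchmarking-only packages are added, and `pyyaml` is presumably what the runner uses to parse the session config shown next. A minimal sketch of that load; the file path and usage here are illustrative, not taken from the runner:

import yaml  # provided by the pyyaml package added above

# Load the nightly session config into a plain dict; run.py then validates it
# and builds a Session object from it (see benchmarking/run.py below).
with open("benchmarking/nightly-benchmark.yaml") as f:
    config_dict = yaml.safe_load(f)

print(config_dict["default_timeout_s"])  # 7200 in the config below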

benchmarking/nightly-benchmark.yaml
Lines changed: 66 additions & 37 deletions
@@ -6,15 +6,30 @@
 # Paths can be used in other values in this file by using their placeholder
 # (e.g. {datasets_path}/my/test/dataset.parquet) and will be resolved to the
 # appropriate path at runtime.
-results_path: /raid/curator-team/nightly/results
-artifacts_path: /raid/curator-team/nightly/artifacts
-datasets_path: /raid
+results_path: /path/where/results/are/stored
+artifacts_path: /path/where/artifacts/are/stored
+datasets_path: /path/to/datasets
 
 datasets:
-  - name: "tinystories_train"
+  - name: "tinystories"
     formats:
       - type: "parquet"
-        path: "{datasets_path}/prospector-lm/clean/tinystories_train_parquet"
+        path: "{datasets_path}/tinystories/parquet_data"
+      - type: "jsonl"
+        path: "{datasets_path}/tinystories/jsonl_data"
+  - name: "commoncrawl"
+    formats:
+      - type: "jsonl"
+        path: "{datasets_path}/commoncrawl/jsonl_data"
+  - name: "commoncrawl_id_map"
+    formats:
+      - type: "json"
+        path: "{datasets_path}/commoncrawl/id_generator.json"
+  - name: "commoncrawl_ids"
+    formats:
+      - type: "parquet"
+        path: "{datasets_path}/commoncrawl/IDs/parquet_data"
 
 default_timeout_s: 7200
 
@@ -42,7 +57,7 @@ entries:
     script: domain_classification_benchmark.py
     args: >-
       --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
      --dataset-size-gb=10
       --model-inference-batch-size=1024
     timeout_s: 20000
@@ -54,17 +69,17 @@ entries:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false
-    # Optional: Requirements for the benchmark to pass. These will result in the benchmark being marked as failed if not met.
+    # Additional requirements for the benchmark to pass. These will result in the benchmark being marked as failed if not met.
     requirements:
       - metric: throughput_docs_per_sec
         min_value: 0.2
 
   - name: domain_classification_xenna
-    enabled: false
+    enabled: true
     script: domain_classification_benchmark.py
     args: >-
       --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-inference-batch-size=1024
     timeout_s: 20000
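The `requirements` block above is enforced by `check_requirements_update_results` in run.py (its signature appears in the diff below, its body does not); a sketch of the kind of `min_value` check it likely performs, with the logic assumed:

from typing import Any

def passes_requirements(result_data: dict[str, Any], requirements: list[dict[str, Any]]) -> bool:
    # Mark the benchmark failed when any metric is missing or below its floor (assumed logic).
    for req in requirements:
        value = result_data.get(req["metric"])
        if value is None or ("min_value" in req and value < req["min_value"]):
            return False
    return True

# passes_requirements({"throughput_docs_per_sec": 0.5},
#                     [{"metric": "throughput_docs_per_sec", "min_value": 0.2}])  # True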
@@ -74,65 +89,79 @@ entries:
     script: embedding_generation_benchmark.py
     args: >-
       --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-identifier=sentence-transformers/all-MiniLM-L6-v2
       --model-inference-batch-size=1024
     timeout_s: 20000
     sink_data:
       - name: slack
-        # Additional metrics to include in the Slack report. These must be present in the metrics.json file generated by the script.
         additional_metrics: ["num_documents_processed", "throughput_docs_per_sec"]
     ray:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false
 
   - name: embedding_generation_xenna
-    enabled: false
+    enabled: true
     script: embedding_generation_benchmark.py
     args: >-
       --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-identifier=sentence-transformers/all-MiniLM-L6-v2
       --model-inference-batch-size=1024
     timeout_s: 20000
 
-  - name: removal_raydata
-    enabled: false
-    script: removal_benchmark.py
+  - name: fuzzy_dedup_identification
+    enabled: true
+    script: fuzzy_dedup_identification_benchmark.py
     args: >-
-      --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
-      --ids-to-remove-path=some_path
-      --id-generator-path=some_path
+      --input-path={dataset:commoncrawl,jsonl}
+      --cache-path={session_entry_dir}/scratch/cache
+      --output-path={session_entry_dir}/output
+      --input-filetype=jsonl
+      --bands-per-iteration=20
+      --text-field=text
+      --input-blocksize=1.5GiB
+    timeout_s: 20000
+    ray:
+      num_cpus: 64
+      num_gpus: 4
+      enable_object_spilling: false
+
+  - name: dedup_removal_raydata
+    enabled: true
+    script: dedup_removal_benchmark.py
+    args: >-
+      --input-path={dataset:commoncrawl,jsonl}
+      --id-generator-path={dataset:commoncrawl_id_map,json}
+      --ids-to-remove-path={dataset:commoncrawl_ids,parquet}
       --output-path={session_entry_dir}/scratch/output
-      --input-filetype=parquet
-      --input-fields=id,text
-      --input-id-field=CURATOR_DEDUP_ID_STR
-      --input-files-per-partition=1
-      --ids-to-remove-fields=id
+      --executor=ray_data
+      --input-filetype=jsonl
       --output-filetype=parquet
+      --id-field=_curator_dedup_id
+      --duplicate-id-field=_curator_dedup_id
+      --blocksize=1.5GiB
     timeout_s: 20000
     ray:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false
 
-  - name: removal_xenna
-    enabled: false
-    script: removal_benchmark.py
+  - name: dedup_removal_xenna
+    enabled: true
+    script: dedup_removal_benchmark.py
     args: >-
-      --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
-      --ids-to-remove-path=some_path
-      --id-generator-path=some_path
+      --input-path={dataset:commoncrawl,jsonl}
+      --id-generator-path={dataset:commoncrawl_id_map,json}
+      --ids-to-remove-path={dataset:commoncrawl_ids,parquet}
       --output-path={session_entry_dir}/scratch/output
-      --input-filetype=parquet
-      --input-fields=id,text
-      --input-id-field=CURATOR_DEDUP_ID_STR
-      --input-files-per-partition=1
-      --ids-to-remove-fields=id
+      --executor=xenna
+      --input-filetype=jsonl
       --output-filetype=parquet
+      --id-field=_curator_dedup_id
+      --duplicate-id-field=_curator_dedup_id
+      --blocksize=1.5GiB
     timeout_s: 20000
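Each entry's `args` template is turned into a concrete command by `Entry.get_command_to_run` (called from run.py below but not defined in this diff); a sketch of the expansion it implies, with the function shape assumed:

from collections.abc import Callable

def build_command(script: str, args_template: str, session_entry_dir: str,
                  resolve: Callable[[str], str]) -> list[str]:
    # Expand {session_entry_dir}, then dataset placeholders, then split into argv (assumed flow).
    args = args_template.replace("{session_entry_dir}", session_entry_dir)
    args = resolve(args)  # e.g. the dataset-placeholder resolution sketched earlier
    return ["python", script, *args.split()]

# build_command("dedup_removal_benchmark.py", "--executor=ray_data --input-filetype=jsonl",
#               "/results/session/dedup_removal_raydata", resolve=lambda s: s)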

benchmarking/run.py
Lines changed: 25 additions & 16 deletions
@@ -38,14 +38,15 @@
 
 # ruff: noqa: E402
 from runner.datasets import DatasetResolver
+from runner.entry import Entry
 from runner.env_capture import dump_env
-from runner.matrix import MatrixConfig, MatrixEntry
 from runner.path_resolver import PathResolver
 from runner.process import run_command_with_timeout
 from runner.ray_cluster import (
     setup_ray_cluster_and_env,
     teardown_ray_cluster_and_env,
 )
+from runner.session import Session
 from runner.utils import find_result, get_obj_for_json, resolve_env_vars
 
 
@@ -131,7 +132,7 @@ def check_requirements_update_results(result_data: dict[str, Any], requirements:
 
 
 def run_entry(
-    entry: MatrixEntry,
+    entry: Entry,
     path_resolver: PathResolver,
     dataset_resolver: DatasetResolver,
     session_path: Path,
@@ -148,17 +149,19 @@ def run_entry(
     ]
     cmd = entry.get_command_to_run(session_entry_path, benchmark_results_path, path_resolver, dataset_resolver)
     run_id = result_data.get("run_id", f"{entry.name}-{int(time.time())}")
+    ray_client = ray_temp_dir = None
 
     try:
         # Create directories individually
         for directory in [scratch_path, ray_cluster_path, logs_path, benchmark_results_path]:
             create_or_overwrite_dir(directory)
 
-        ray_client, ray_temp_dir, ray_env = setup_ray_cluster_and_env(
+        ray_client, ray_temp_dir = setup_ray_cluster_and_env(
             num_cpus=entry.ray.get("num_cpus", os.cpu_count() or 1),
             num_gpus=entry.ray.get("num_gpus", 0),
             enable_object_spilling=bool(entry.ray.get("enable_object_spilling", False)),
             ray_log_path=logs_path / "ray.log",
+            object_store_size_bytes=entry.object_store_size_bytes,
         )
 
         # Execute command with timeout
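The updated call site implies `setup_ray_cluster_and_env` now returns only `(ray_client, ray_temp_dir)` and accepts an object-store size; the helper itself lives in runner/ray_cluster.py and is not in this diff, so this stub of its new shape is an assumption:

from pathlib import Path
from typing import Any

def setup_ray_cluster_and_env(
    num_cpus: int,
    num_gpus: int,
    enable_object_spilling: bool,
    ray_log_path: Path,
    object_store_size_bytes: int | None = None,
) -> tuple[Any, Path]:
    # Start a local Ray cluster and return (client, temp dir); the env handling
    # previously returned as ray_env is now internal to the helper.
    ...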
@@ -168,7 +171,6 @@ def run_entry(
             command=cmd,
             timeout=entry.timeout_s,
             stdouterr_path=logs_path / "stdouterr.log",
-            env=ray_env,
             run_id=run_id,
             fancy=os.environ.get("CURATOR_BENCHMARKING_DEBUG", "0") == "0",
         )
@@ -252,40 +254,47 @@ def main() -> int:
         config_dict.update(d)
     # Preprocess the config dict prior to creating objects from it
     try:
-        MatrixConfig.assert_valid_config_dict(config_dict)
+        Session.assert_valid_config_dict(config_dict)
         config_dict = resolve_env_vars(config_dict)
     except ValueError as e:
         logger.error(f"Invalid configuration: {e}")
         return 1
 
-    config = MatrixConfig.create_from_dict(config_dict)
+    session = Session.create_from_dict(config_dict)
 
     # Create session folder under results_dir
     session_name = args.session_name or time.strftime("benchmark-run__%Y-%m-%d__%H-%M-%S")
-    session_path = (config.results_path / session_name).absolute()
+    session_path = (session.results_path / session_name).absolute()
     ensure_dir(session_path)
 
     session_overall_success = True
     logger.info(f"Started session {session_name}...")
-    env_dict = dump_env(session_path)
+    env_dict = dump_env(session_obj=session, output_path=session_path)
 
-    for sink in config.sinks:
-        sink.initialize(session_name=session_name, matrix_config=config, env_dict=env_dict)
+    for sink in session.sinks:
+        sink.initialize(session_name=session_name, matrix_config=session, env_dict=env_dict)
 
-    for entry in config.entries:
+    # Print a summary of the entries that will be run in the for loop below
+    # Disabled entries will not be printed
+    # TODO: should entries be created unconditionally and have an "enabled" field instead?
+    logger.info("Benchmark entries to be run in this session:")
+    for idx, entry in enumerate(session.entries, start=1):
+        logger.info(f"\t{idx}. {entry.name}")
+
+    for entry in session.entries:
         run_success = False
         run_id = f"{entry.name}-{int(time.time())}"
         result_data = {
             "name": entry.name,
             "run_id": run_id,
             "success": run_success,
         }
-        logger.info(f"\tRunning {entry.name} (run ID: {run_id})")
+        logger.info(f"🚀 Running {entry.name} (run ID: {run_id})")
         try:
             run_success = run_entry(
                 entry=entry,
-                path_resolver=config.path_resolver,
-                dataset_resolver=config.dataset_resolver,
+                path_resolver=session.path_resolver,
+                dataset_resolver=session.dataset_resolver,
                 session_path=session_path,
                 result_data=result_data,
             )
@@ -305,10 +314,10 @@ def main() -> int:
 
         finally:
             session_overall_success &= run_success
-            for sink in config.sinks:
+            for sink in session.sinks:
                 sink.process_result(result_dict=result_data, matrix_entry=entry)
 
-    for sink in config.sinks:
+    for sink in session.sinks:
         sink.finalize()
     logger.info(f"Session {session_name} completed with overall success: {session_overall_success}")
     return 0 if session_overall_success else 1
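The sink calls in main() trace out a three-phase lifecycle: `initialize` once per session, `process_result` after each entry, `finalize` at the end. A minimal protocol matching those call sites; the concrete sink classes, such as the slack sink configured in the YAML above, are not part of this diff:

from typing import Any, Protocol

class Sink(Protocol):
    # Lifecycle implied by the call sites in run.py above.
    def initialize(self, session_name: str, matrix_config: Any, env_dict: dict[str, Any]) -> None: ...
    def process_result(self, result_dict: dict[str, Any], matrix_entry: Any) -> None: ...
    def finalize(self) -> None: ...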
