NVIDIA-NeMo
diff --git a/‎benchmarking/Dockerfile‎
Lines changed: 1 addition & 0 deletions b/‎benchmarking/Dockerfile‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarking/README.md‎
Lines changed: 7 additions & 5 deletions b/‎benchmarking/README.md‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎benchmarking/nightly-benchmark.yaml‎
Lines changed: 57 additions & 4 deletions b/‎benchmarking/nightly-benchmark.yaml‎
Lines changed: 57 additions & 4 deletions
diff --git a/‎benchmarking/run.py‎
Lines changed: 8 additions & 4 deletions b/‎benchmarking/run.py‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎benchmarking/runner/sinks/gdrive_sink.py‎
Lines changed: 13 additions & 10 deletions b/‎benchmarking/runner/sinks/gdrive_sink.py‎
Lines changed: 13 additions & 10 deletions
diff --git a/‎benchmarking/runner/sinks/mlflow_sink.py‎
Lines changed: 12 additions & 9 deletions b/‎benchmarking/runner/sinks/mlflow_sink.py‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎benchmarking/runner/sinks/sink.py‎
Lines changed: 14 additions & 5 deletions b/‎benchmarking/runner/sinks/sink.py‎
Lines changed: 14 additions & 5 deletions
@@ -36,6 +36,7 @@ RUN cd /opt/Curator \
     pynvml \
     pyyaml \
     rich \
+    slack_sdk \
   && uv cache prune
 
 # Add the Curator repo to the safe.directory list to avoid GitPython warnings
 
@@ -148,7 +148,7 @@ sinks:
     experiment: my-experiment
   - name: slack
     enabled: true
-    webhook_url: ${SLACK_WEBHOOK_URL}
+    channel_id: ${SLACK_CHANNEL_ID}
     default_metrics: ["exec_time_s"]  # Metrics to report by default for all entries
   - name: gdrive
     enabled: false
@@ -223,7 +223,7 @@ Configuration values can reference environment variables using `${VAR_NAME}` syn
 results_path: "${HOME}/benchmarks/results"
 sinks:
   - name: slack
-    webhook_url: ${SLACK_WEBHOOK_URL}
+    channel_id: ${SLACK_CHANNEL_ID}
   - name: mlflow
     tracking_uri: ${MLFLOW_TRACKING_URI}
 ```
@@ -311,7 +311,7 @@ This command:
 - Reads the configuration file and extracts `results_path` and `datasets_path`
 - Automatically creates volume mounts to map these paths into the container
 - Runs the benchmarking framework with the Curator code built into the Docker image
-- Passes environment variables like `SLACK_WEBHOOK_URL` and `MLFLOW_TRACKING_URI` to the container
+- Passes environment variables like `SLACK_BOT_TOKEN`, `SLACK_CHANNEL_ID`, and `MLFLOW_TRACKING_URI` to the container
 
 ### Using Host Curator Sources
 
@@ -459,11 +459,13 @@ Posts results to Slack channels:
 ```yaml
 sinks:
   - name: slack
-    webhook_url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL
+    channel_id: C1234567890  # Your Slack channel ID
     enabled: true
 ```
 
-Results are formatted as interactive Slack messages with environment info and metrics.
+Results are posted as interactive Slack messages with environment info and metrics. Requires:
+- `SLACK_BOT_TOKEN` environment variable set to your Slack Bot User OAuth Token
+- `channel_id` in the sink config set to the target channel's ID, either directly or via the `SLACK_CHANNEL_ID` environment variable (e.g. `channel_id: ${SLACK_CHANNEL_ID}`)
 
 #### Google Drive Sink
 
 
@@ -55,10 +55,16 @@ datasets:
     formats:
     - type: "tar"
       path: "{datasets_path}/arxiv_downloads"
-  - name: "fasttext_model"
+  - name: "fasttext_langid_model"
     formats:
     - type: "bin"
       path: "{model_weights_path}/fasttext/lid.176.bin"
+    - type: "ftz"
+      path: "{model_weights_path}/fasttext/lid.176.ftz"
+  - name: "fasttext_quality_model"
+    formats:
+    - type: "bin"
+      path: "{model_weights_path}/fasttext/model.bin"
   - name: "gretel_symptoms"
     formats:
     - type: "jsonl"
@@ -73,7 +79,8 @@ sinks:
 #    experiment: ray-curator-common-crawl
   - name: slack
     enabled: true
-    webhook_url: ${SLACK_WEBHOOK_URL}
+    live_updates: true
+    channel_id: ${SLACK_CHANNEL_ID}
     default_metrics: ["exec_time_s"]
 #  - name: gdrive
 #    enabled: false
@@ -411,6 +418,52 @@ entries:
       - metric: throughput_docs_per_sec
         min_value: 8500
 
+  - name: fasttext_filter_raydata
+    enabled: true
+    script: fasttext_filter_benchmark.py
+    args: >-
+      --benchmark-results-path={session_entry_dir}
+      --output-path={session_entry_dir}/scratch/output
+      --executor=ray_data
+      --input-path={dataset:tinystories,parquet}
+      --yaml-config={curator_repo_dir}/nemo_curator/config/text/fasttext_filter_pipeline.yaml
+      --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
+      --fasttext-quality-model-path={dataset:fasttext_quality_model,bin}
+      --overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
+    timeout_s: 200
+    sink_data:
+      - name: slack
+        additional_metrics:
+          - num_kept_documents
+          - throughput_docs_per_sec
+    ray:
+      num_cpus: 64
+      num_gpus: 0
+      enable_object_spilling: false
+
+  - name: fasttext_filter_xenna
+    enabled: true
+    script: fasttext_filter_benchmark.py
+    args: >-
+      --benchmark-results-path={session_entry_dir}
+      --output-path={session_entry_dir}/scratch/output
+      --executor=xenna
+      --input-path={dataset:tinystories,parquet}
+      --yaml-config={curator_repo_dir}/nemo_curator/config/text/fasttext_filter_pipeline.yaml
+      --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
+      --fasttext-quality-model-path={dataset:fasttext_quality_model,bin}
+      --overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
+    timeout_s: 100
+    sink_data:
+      - name: slack
+        additional_metrics:
+          - num_kept_documents
+          - throughput_docs_per_sec
+    ray:
+      num_cpus: 64
+      num_gpus: 0
+      enable_object_spilling: false
+
   - name: modifier_raydata
     enabled: true
     script: modifier_benchmark.py
@@ -493,7 +546,7 @@ entries:
       --benchmark-results-path={session_entry_dir}
       --tar-input-path={dataset:arxiv_downloads,tar}
       --output-path={session_entry_dir}/scratch/output
-      --fasttext-model-path={dataset:fasttext_model,bin}
+      --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
       --executor=ray_data
     timeout_s: 3600
     sink_data:
@@ -522,7 +575,7 @@ entries:
       --benchmark-results-path={session_entry_dir}
       --tar-input-path={dataset:arxiv_downloads,tar}
       --output-path={session_entry_dir}/scratch/output
-      --fasttext-model-path={dataset:fasttext_model,bin}
+      --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
       --executor=xenna
     timeout_s: 3600
     sink_data:
 
@@ -255,15 +255,15 @@ def run_entry(
             shutil.rmtree(scratch_path, ignore_errors=True)
 
 
-def main() -> int:  # noqa: C901
+def main() -> int:  # noqa: C901, PLR0912
     parser = argparse.ArgumentParser(description="Runs the benchmarking application")
     parser.add_argument(
         "--config",
         type=Path,
         action="append",
         required=True,
         help=(
-            "Path to YAML config for benchmark matrix, machine paths, etc. Can be "
+            "Path to YAML config for the benchmark entries, machine paths, etc. Can be "
             "specified multiple times to merge configs."
         ),
     )
@@ -322,7 +322,7 @@ def main() -> int:  # noqa: C901
     env_dict = dump_env(session_obj=session, output_path=session_path)
 
     for sink in session.sinks:
-        sink.initialize(session_name=session_name, matrix_config=session, env_dict=env_dict)
+        sink.initialize(session_name=session_name, session=session, env_dict=env_dict)
 
     # Print a summary of the entries that will be run in the for loop below
     # Disabled entries will not be printed
@@ -339,6 +339,10 @@ def main() -> int:  # noqa: C901
             "success": run_success,
         }
         logger.info(f"🚀 Running {entry.name} (run ID: {run_id})")
+
+        for sink in session.sinks:
+            sink.register_benchmark_entry_starting(result_dict=result_data, benchmark_entry=entry)
+
         try:
             run_success = run_entry(
                 entry=entry,
@@ -364,7 +368,7 @@ def main() -> int:  # noqa: C901
         finally:
             session_overall_success &= run_success
             for sink in session.sinks:
-                sink.process_result(result_dict=result_data, matrix_entry=entry)
+                sink.register_benchmark_entry_finished(result_dict=result_data, benchmark_entry=entry)
 
     for sink in session.sinks:
         sink.finalize()
 
@@ -33,15 +33,15 @@ def __init__(self, sink_config: dict[str, Any]):
         super().__init__(sink_config)
         self.sink_config = sink_config
         self.results: list[dict[str, Any]] = []
-        self.session_name: str = None
-        self.matrix_config: Session = None
-        self.env_dict: dict[str, Any] = None
-        self.drive_folder_id: str = None
-        self.service_account_file: str = None
+        self.session_name: str | None = None
+        self.session: Session | None = None
+        self.env_dict: dict[str, Any] | None = None
+        self.drive_folder_id: str | None = None
+        self.service_account_file: str | None = None
 
-    def initialize(self, session_name: str, matrix_config: Session, env_dict: dict[str, Any]) -> None:
+    def initialize(self, session_name: str, session: Session, env_dict: dict[str, Any]) -> None:
         self.session_name = session_name
-        self.matrix_config = matrix_config
+        self.session = session
         self.env_dict = env_dict
         self.drive_folder_id = self.sink_config.get("drive_folder_id")
         if not self.drive_folder_id:
@@ -52,7 +52,10 @@ def initialize(self, session_name: str, matrix_config: Session, env_dict: dict[s
             msg = "GdriveSink: No service account file configured"
             raise ValueError(msg)
 
-    def process_result(self, result_dict: dict[str, Any], matrix_entry: Entry) -> None:
+    def register_benchmark_entry_starting(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
+        pass
+
+    def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
         pass
 
     def finalize(self) -> None:
@@ -66,8 +69,8 @@ def finalize(self) -> None:
             self._delete_tar_file(tar_path)
 
     def _tar_results_and_artifacts(self) -> Path:
-        results_path = Path(self.matrix_config.results_path)
-        artifacts_path = Path(self.matrix_config.artifacts_dir)
+        results_path = Path(self.session.results_path)
+        artifacts_path = Path(self.session.artifacts_dir)
         tar_path = results_path / f"{self.session_name}.tar.gz"
         with tarfile.open(tar_path, "w:gz") as tar:
             tar.add(results_path, arcname=results_path.name)
 
@@ -34,20 +34,23 @@ def __init__(self, sink_config: dict[str, Any]):
             msg = "MlflowSink: No experiment configured"
             raise ValueError(msg)
         self.results: list[dict[str, Any]] = []
-        self.session_name: str = None
-        self.matrix_config: Session = None
-        self.env_dict: dict[str, Any] = None
+        self.session_name: str | None = None
+        self.session: Session | None = None
+        self.env_dict: dict[str, Any] | None = None
 
-    def initialize(self, session_name: str, matrix_config: Session, env_dict: dict[str, Any]) -> None:
+    def initialize(self, session_name: str, session: Session, env_dict: dict[str, Any]) -> None:
         self.session_name = session_name
-        self.matrix_config = matrix_config
+        self.session = session
         self.env_dict = env_dict
 
-    def process_result(self, result_dict: dict[str, Any], matrix_entry: Entry) -> None:
-        # Use the matrix_entry to get any entry-specific settings for the Slack report
+    def register_benchmark_entry_starting(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
+        pass
+
+    def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
+        # Use the benchmark_entry to get any entry-specific settings for the MLflow report
         # such as additional metrics to include in the report.
-        if matrix_entry:
-            additional_metrics = matrix_entry.get_sink_data(self.name).get("additional_metrics", [])
+        if benchmark_entry:
+            additional_metrics = benchmark_entry.get_sink_data(self.name).get("additional_metrics", [])
         else:
             additional_metrics = []
         self.results.append((additional_metrics, result_dict))
 
@@ -33,24 +33,33 @@ def __init__(self, sink_config: dict[str, Any]):
     def initialize(
         self,
         session_name: str,
-        matrix_config: Session,
+        session: Session,
         env_dict: dict[str, Any],
     ) -> None:
         """Initialize the sink for a benchmark session.
 
         Args:
             session_name: Name of the benchmark session.
-            matrix_config: Session configuration for the session.
+            session: Session object holding the configuration for this benchmark session.
             env_dict: Environment dictionary for the session.
         """
 
     @abstractmethod
-    def process_result(self, result_dict: dict[str, Any], matrix_entry: Entry) -> None:
-        """Process an individual benchmark result.
+    def register_benchmark_entry_starting(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
+        """Register that a benchmark entry is starting.
+
+        Args:
+            result_dict: Dictionary containing benchmark entry data.
+            benchmark_entry: Entry configuration.
+        """
+
+    @abstractmethod
+    def register_benchmark_entry_finished(self, result_dict: dict[str, Any], benchmark_entry: Entry) -> None:
+        """Register that a benchmark entry has finished.
 
         Args:
             result_dict: Dictionary containing benchmark result data.
-            matrix_entry: Entry configuration.
+            benchmark_entry: Entry configuration.
         """
 
     @abstractmethod