Skip to content

Commit 6c71504

Browse files
authored
feat(exporters): put the NEL config in the MLflow parameters (#653)
Signed-off-by: Piotr Januszewski <pjanuszewski@nvidia.com>
1 parent 0257b14 commit 6c71504

File tree

4 files changed

+304
-16
lines changed

4 files changed

+304
-16
lines changed

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/exporters/mlflow.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from nemo_evaluator_launcher.exporters.utils import (
3636
extract_accuracy_metrics,
3737
extract_exporter_config,
38+
flatten_config,
3839
get_artifact_root,
3940
get_available_artifacts,
4041
get_benchmark_info,
@@ -178,6 +179,15 @@ def export_job(self, job_data: JobData) -> ExportResult:
178179
if mlflow_config.get("extra_metadata"):
179180
all_params.update(mlflow_config["extra_metadata"])
180181

182+
# Add flattened config as params if enabled
183+
if mlflow_config.get("log_config_params", False):
184+
config_params = flatten_config(
185+
job_data.config or {},
186+
parent_key="config",
187+
max_depth=mlflow_config.get("log_config_params_max_depth", 10),
188+
)
189+
all_params.update(config_params)
190+
181191
# Add webhook info if available
182192
if mlflow_config.get("triggered_by_webhook"):
183193
all_params.update(
@@ -525,16 +535,31 @@ def export_invocation(self, invocation_id: str) -> Dict[str, Any]:
525535
if mlflow_config.get("extra_metadata"):
526536
all_params.update(mlflow_config["extra_metadata"])
527537

538+
# Add flattened config as params if enabled
539+
if mlflow_config.get("log_config_params", False):
540+
config_params = flatten_config(
541+
first_job.config or {},
542+
parent_key="config",
543+
max_depth=mlflow_config.get("log_config_params_max_depth", 10),
544+
)
545+
all_params.update(config_params)
546+
528547
# Prepare tags
529548
tags = {"invocation_id": invocation_id}
530549
if mlflow_config.get("tags"):
531550
tags.update({k: v for k, v in mlflow_config["tags"].items() if v})
532551

533-
# Truncate
552+
# Sanitize params and tags
534553
safe_params = {
535-
str(k)[:250]: str(v)[:250] for k, v in all_params.items() if v
554+
mlflow_sanitize(k, "param_key"): mlflow_sanitize(v, "param_value")
555+
for k, v in (all_params or {}).items()
556+
if v is not None
557+
}
558+
safe_tags = {
559+
mlflow_sanitize(k, "tag_key"): mlflow_sanitize(v, "tag_value")
560+
for k, v in (tags or {}).items()
561+
if v is not None
536562
}
537-
safe_tags = {str(k)[:250]: str(v)[:5000] for k, v in tags.items() if v}
538563

539564
# Check for existing run
540565
exists, existing_run_id = self._get_existing_run_info(

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/exporters/utils.py

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -416,12 +416,19 @@ def scp_file(remote_path: str, local_path: Path) -> bool:
416416
if scp_file(remote_file, local_file):
417417
exported_files.append(str(local_file))
418418
else:
419-
# Copy known files individually to avoid subfolders and satisfy tests
420-
for artifact in get_available_artifacts(paths.get("artifacts_dir", Path())):
421-
remote_file = f"{paths['remote_path']}/artifacts/{artifact}"
422-
local_file = art_dir / artifact
423-
if scp_file(remote_file, local_file):
424-
exported_files.append(str(local_file))
419+
# Copy all artifacts recursively when only_required=False
420+
cmd = (
421+
["scp", "-r"]
422+
+ ssh_opts
423+
+ [
424+
f"{paths['username']}@{paths['hostname']}:{paths['remote_path']}/artifacts/.",
425+
str(art_dir),
426+
]
427+
)
428+
if subprocess.run(cmd, capture_output=True).returncode == 0:
429+
exported_files.extend(
430+
[str(f) for f in art_dir.rglob("*") if f.is_file()]
431+
)
425432

426433
# Logs (top-level only)
427434
if copy_logs:
@@ -586,6 +593,60 @@ def _safe_update_metrics(
586593
_safe_set_metric(target, k, v, context)
587594

588595

596+
# =============================================================================
597+
# CONFIG FLATTENING
598+
# =============================================================================
599+
600+
601+
def flatten_config(
602+
config: Any,
603+
parent_key: str = "",
604+
sep: str = ".",
605+
max_depth: int = 10,
606+
) -> Dict[str, str]:
607+
"""
608+
Flatten a nested config dict into dot-notation keys.
609+
610+
Args:
611+
config: Nested configuration (dict, list, or scalar)
612+
parent_key: Prefix for keys (used in recursion)
613+
sep: Separator between nested keys
614+
max_depth: Maximum recursion depth to prevent infinite loops
615+
616+
Returns:
617+
Flattened dictionary with string values
618+
619+
Examples:
620+
>>> flatten_config({"a": {"b": 1}})
621+
{"a.b": "1"}
622+
>>> flatten_config({"tasks": [{"name": "foo"}, {"name": "bar"}]})
623+
{"tasks.0.name": "foo", "tasks.1.name": "bar"}
624+
"""
625+
if max_depth <= 0:
626+
return {parent_key: str(config)} if parent_key else {}
627+
628+
if isinstance(config, dict):
629+
items: Dict[str, str] = {}
630+
for key, value in config.items():
631+
new_key = f"{parent_key}{sep}{key}" if parent_key else key
632+
items.update(flatten_config(value, new_key, sep, max_depth - 1))
633+
return items
634+
635+
if isinstance(config, list):
636+
items: Dict[str, str] = {}
637+
for idx, item in enumerate(config):
638+
item_key = f"{parent_key}{sep}{idx}" if parent_key else str(idx)
639+
items.update(flatten_config(item, item_key, sep, max_depth - 1))
640+
return items
641+
642+
# Scalar value
643+
if not parent_key:
644+
return {}
645+
if config is None:
646+
return {parent_key: "null"}
647+
return {parent_key: str(config)}
648+
649+
589650
# =============================================================================
590651
# MLFLOW FUNCTIONS
591652
# =============================================================================

packages/nemo-evaluator-launcher/tests/unit_tests/exporters/test_mlflow_exporter.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,3 +377,104 @@ def _log_artifact(path, artifact_path=None):
377377
assert res.metadata["artifacts_logged"] == 1
378378
assert calls["config"] == 1
379379
assert calls["files"] == [("results.yml", "taskX/artifacts")]
380+
381+
    def test_log_config_params_flattens_config(
        self, monkeypatch, mlflow_fake, tmp_path: Path
    ):
        """Test that log_config_params=True flattens the config into MLflow params."""
        _ML, _RunCtx = mlflow_fake

        # Job with nested config: covers dicts, a list of tasks, and a task
        # with a nested sub-config, so the flattened keys exercise every path.
        config = {
            "deployment": {"tensor_parallel_size": 8, "model": "test-model"},
            "evaluation": {
                "tasks": [
                    {"name": "task1", "config": {"param": "value1"}},
                    {"name": "task2"},
                ]
            },
        }
        jd = JobData("mP", "mP.0", 0.0, "local", {"output_dir": str(tmp_path)}, config)

        # Stub the metrics extraction so the export succeeds without real
        # result artifacts on disk.
        monkeypatch.setattr(
            "nemo_evaluator_launcher.exporters.mlflow.extract_accuracy_metrics",
            lambda *_: {"task_accuracy": 0.9},
            raising=True,
        )

        # Capture log_params calls: every dict the exporter passes to
        # mlflow.log_params is merged into logged_params for inspection.
        logged_params = {}
        monkeypatch.setattr(
            "nemo_evaluator_launcher.exporters.mlflow.mlflow.log_params",
            lambda p: logged_params.update(p),
            raising=True,
        )

        exp = MLflowExporter(
            {
                "tracking_uri": "http://mlflow",
                "log_config_params": True,
            }
        )
        # Force the "no existing run" path so a fresh run is created.
        monkeypatch.setattr(
            exp, "_get_existing_run_info", lambda *a, **k: (False, None), raising=False
        )

        res = exp.export_job(jd)
        assert res.success

        # Verify flattened config params are logged under the "config" prefix,
        # with all values stringified (8 -> "8") and list items indexed.
        assert "config.deployment.tensor_parallel_size" in logged_params
        assert logged_params["config.deployment.tensor_parallel_size"] == "8"
        assert "config.deployment.model" in logged_params
        assert logged_params["config.deployment.model"] == "test-model"
        assert "config.evaluation.tasks.0.name" in logged_params
        assert logged_params["config.evaluation.tasks.0.name"] == "task1"
        assert "config.evaluation.tasks.1.name" in logged_params
        assert logged_params["config.evaluation.tasks.1.name"] == "task2"
435+
436+
    def test_log_config_params_with_max_depth(
        self, monkeypatch, mlflow_fake, tmp_path: Path
    ):
        """Test that log_config_params_max_depth limits flattening depth."""
        _ML, _RunCtx = mlflow_fake

        # Four levels of nesting; with max_depth=2 only the first two levels
        # ("config" prefix counts as one) should be expanded into keys.
        config = {"a": {"b": {"c": {"d": "deep"}}}}
        jd = JobData(
            "mDepth", "mDepth.0", 0.0, "local", {"output_dir": str(tmp_path)}, config
        )

        # Stub metrics extraction so export_job succeeds without artifacts.
        monkeypatch.setattr(
            "nemo_evaluator_launcher.exporters.mlflow.extract_accuracy_metrics",
            lambda *_: {"acc": 0.9},
            raising=True,
        )

        # Capture every params dict the exporter logs to MLflow.
        logged_params = {}
        monkeypatch.setattr(
            "nemo_evaluator_launcher.exporters.mlflow.mlflow.log_params",
            lambda p: logged_params.update(p),
            raising=True,
        )

        exp = MLflowExporter(
            {
                "tracking_uri": "http://mlflow",
                "log_config_params": True,
                "log_config_params_max_depth": 2,
            }
        )
        # Force the "no existing run" path so a fresh run is created.
        monkeypatch.setattr(
            exp, "_get_existing_run_info", lambda *a, **k: (False, None), raising=False
        )

        res = exp.export_job(jd)
        assert res.success

        # At depth 2, a.b should be stringified (not further flattened)
        assert "config.a.b" in logged_params
        # The value should be a string representation of the remaining dict
        assert "c" in logged_params["config.a.b"]
        # Deeper keys should not exist
        assert "config.a.b.c" not in logged_params
        assert "config.a.b.c.d" not in logged_params

packages/nemo-evaluator-launcher/tests/unit_tests/exporters/test_utils.py

Lines changed: 108 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
MetricConflictError,
3838
_safe_update_metrics,
3939
extract_accuracy_metrics,
40+
flatten_config,
4041
get_available_artifacts,
4142
get_benchmark_info,
4243
get_container_from_mapping,
@@ -323,24 +324,41 @@ def test_download_artifacts_only_required_with_logs(self, tmp_path: Path):
323324
}
324325
assert set(out).issuperset(expected_artifacts)
325326

326-
def test_download_artifacts_available_only(self, tmp_path: Path):
327-
local_artifacts = tmp_path / "local_artifacts"
328-
local_artifacts.mkdir()
329-
(local_artifacts / "results.yml").write_text("x")
330-
327+
    def test_download_artifacts_only_required_false_uses_scp_recursive(
        self, tmp_path: Path
    ):
        """Test only_required=False uses scp -r to copy all artifacts recursively."""
        # Minimal SSH path info; no artifacts_dir because the recursive
        # branch copies the whole remote artifacts directory.
        paths = {
            "username": "user",
            "hostname": "host",
            "remote_path": "/remote",
        }
        scp_calls = []

        # Mock scp -r to simulate creating files in the target directory
        def fake_run(cmd, capture_output=True):
            scp_calls.append(cmd)
            if "-r" in cmd:
                # Simulate scp -r by creating files including nested dirs
                art_dir = tmp_path / "artifacts"
                art_dir.mkdir(parents=True, exist_ok=True)
                (art_dir / "results.yml").write_text("x")
                (art_dir / "extra.json").write_text("{}")
                (art_dir / "subdir").mkdir(exist_ok=True)
                (art_dir / "subdir" / "nested.txt").write_text("nested")
            # Every subprocess call (scp and any ssh control commands)
            # reports success so the download path proceeds.
            return SimpleNamespace(returncode=0)

        with patch("subprocess.run", side_effect=fake_run):
            out = U.ssh_download_artifacts(
                paths, tmp_path, config={"only_required": False}, control_paths=None
            )

        # Verify scp -r was called
        assert any("-r" in c for c in scp_calls)
        # Verify all files including nested are listed
        assert str(tmp_path / "artifacts" / "results.yml") in out
        assert str(tmp_path / "artifacts" / "extra.json") in out
        assert str(tmp_path / "artifacts" / "subdir" / "nested.txt") in out
344362

345363
def test_download_with_control_paths(self, tmp_path: Path, monkeypatch):
346364
paths = {"username": "u", "hostname": "h", "remote_path": "/remote"}
@@ -593,3 +611,86 @@ def export_invocation(self, inv_id):
593611
# metadata injected for each job
594612
for job in payload["jobs"].values():
595613
assert "metadata" in job
614+
615+
616+
class TestFlattenConfig:
    """Behavioral tests for the flatten_config utility."""

    def test_simple_dict(self):
        """Scalar values are stringified; top-level keys are kept as-is."""
        assert flatten_config({"a": 1, "b": "hello"}) == {"a": "1", "b": "hello"}

    def test_nested_dict(self):
        """Nested dicts collapse into a single dot-joined key."""
        assert flatten_config({"a": {"b": {"c": 42}}}) == {"a.b.c": "42"}

    def test_with_parent_key(self):
        """An explicit parent_key prefixes every produced key."""
        assert flatten_config({"x": 1}, parent_key="config") == {"config.x": "1"}

    def test_list_with_scalars(self):
        """List items are addressed by their numeric index."""
        flat = flatten_config({"items": ["a", "b", "c"]})
        assert flat == {"items.0": "a", "items.1": "b", "items.2": "c"}

    def test_list_with_dicts(self):
        """Dicts inside lists flatten through the index segment."""
        flat = flatten_config({"tasks": [{"name": "foo"}, {"name": "bar"}]})
        assert flat == {"tasks.0.name": "foo", "tasks.1.name": "bar"}

    def test_nested_list_of_dicts(self):
        """Deeply nested task configs keep their full dotted paths."""
        cfg = {
            "evaluation": {
                "tasks": [
                    {"name": "task1", "config": {"param": "value1"}},
                    {"name": "task2", "config": {"param": "value2"}},
                ]
            }
        }
        flat = flatten_config(cfg, parent_key="config")
        assert flat["config.evaluation.tasks.0.name"] == "task1"
        assert flat["config.evaluation.tasks.0.config.param"] == "value1"
        assert flat["config.evaluation.tasks.1.name"] == "task2"
        assert flat["config.evaluation.tasks.1.config.param"] == "value2"

    def test_null_values(self):
        """None becomes the literal string 'null' at any nesting level."""
        flat = flatten_config({"a": None, "b": {"c": None}})
        assert flat == {"a": "null", "b.c": "null"}

    def test_max_depth_limit(self):
        """Beyond max_depth the remaining subtree is stringified wholesale."""
        flat = flatten_config({"a": {"b": {"c": {"d": "deep"}}}}, max_depth=2)
        # At depth 2, the inner dict should be stringified
        assert "a.b" in flat
        assert "{'c': {'d': 'deep'}}" in flat["a.b"]

    def test_empty_dict(self):
        """An empty mapping yields no params."""
        assert flatten_config({}) == {}

    def test_empty_list(self):
        """An empty list contributes nothing."""
        assert flatten_config({"items": []}) == {}

    def test_mixed_types(self):
        """Scalars of every basic type are converted via str()."""
        flat = flatten_config(
            {
                "string": "hello",
                "number": 42,
                "float": 3.14,
                "bool": True,
                "none": None,
            }
        )
        assert flat["string"] == "hello"
        assert flat["number"] == "42"
        assert flat["float"] == "3.14"
        assert flat["bool"] == "True"
        assert flat["none"] == "null"

    def test_custom_separator(self):
        """The sep argument replaces the default dot separator."""
        assert flatten_config({"a": {"b": 1}}, sep="/") == {"a/b": "1"}

0 commit comments

Comments
 (0)