automl
diff --git a/‎arlbench/autorl/autorl_env.py‎
Lines changed: 40 additions & 2 deletions b/‎arlbench/autorl/autorl_env.py‎
Lines changed: 40 additions & 2 deletions
diff --git a/‎arlbench/core/algorithms/sac/sac.py‎
Lines changed: 2 additions & 1 deletion b/‎arlbench/core/algorithms/sac/sac.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tests/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/autorl/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tests/autorl/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/autorl/test_autorl_env.py‎
Lines changed: 284 additions & 0 deletions b/‎tests/autorl/test_autorl_env.py‎
Lines changed: 284 additions & 0 deletions
@@ -3,12 +3,13 @@
 
 import warnings
 from collections.abc import Callable
-from typing import Any
+from typing import Any, Dict
 
 import gymnasium
 import jax
 import numpy as np
 import pandas as pd
+from omegaconf import OmegaConf
 from ConfigSpace import Configuration, ConfigurationSpace
 
 from arlbench.core.algorithms import (
@@ -265,6 +266,37 @@ def _make_algorithm(self) -> Algorithm:
             cnn_policy=self._config["cnn_policy"],
             deterministic_eval=self._config["deterministic_eval"],
         )
+    
+    def get_algorithm_init_kwargs(self, init_rng) -> Dict:
+        """Collect the keyword arguments for re-initializing the algorithm.
+
+        All values except the random key are read from the current
+        ``self._algorithm_state``; which entries are returned depends on the
+        type of ``self._algorithm``.
+
+        Args:
+            init_rng: Random key stored under ``"rng"`` in the returned dict.
+
+        Returns:
+            Dict: Dictionary of algorithm initialization parameters.
+
+        Raises:
+            ValueError: If ``self._algorithm`` is neither PPO, DQN nor SAC.
+        """
+        if isinstance(self._algorithm, PPO):
+            ppo_train_state = self._algorithm_state.runner_state.train_state
+            return {
+                "rng": init_rng,
+                "network_params": ppo_train_state.params,
+                "opt_state": ppo_train_state.opt_state,
+            }
+
+        if isinstance(self._algorithm, DQN):
+            state = self._algorithm_state
+            replay_buffer = state.buffer_state
+            dqn_train_state = state.runner_state.train_state
+            return {
+                "rng": init_rng,
+                "buffer_state": replay_buffer,
+                "network_params": dqn_train_state.params,
+                "target_params": dqn_train_state.target_params,
+                "opt_state": dqn_train_state.opt_state,
+            }
+
+        if isinstance(self._algorithm, SAC):
+            state = self._algorithm_state
+            replay_buffer = state.buffer_state
+            runner = state.runner_state
+            # Params of all three networks are read first, then the optimizer states.
+            return {
+                "rng": init_rng,
+                "buffer_state": replay_buffer,
+                "actor_network_params": runner.actor_train_state.params,
+                "critic_network_params": runner.critic_train_state.params,
+                "critic_target_params": runner.critic_train_state.target_params,
+                "alpha_network_params": runner.alpha_train_state.params,
+                "actor_opt_state": runner.actor_train_state.opt_state,
+                "critic_opt_state": runner.critic_train_state.opt_state,
+                "alpha_opt_state": runner.alpha_train_state.opt_state,
+            }
+
+        raise ValueError(f"Unsupported algorithm: {self._algorithm.name}")
 
     def step(
         self,
@@ -304,7 +336,9 @@ def step(
 
         # Apply changes to current hyperparameter configuration and reinstantiate algorithm
         if isinstance(action, dict):
-            action = Configuration(self.config_space, action)
+            action_config = dict(self._hpo_config)
+            action_config.update(action)
+            action = Configuration(self.config_space, action_config)
         self._hpo_config = action
 
         seed = seed if seed else self._seed
@@ -325,6 +359,10 @@ def step(
         elif self._algorithm_state is None:
             init_rng = jax.random.key(seed)
             self._algorithm_state = self._algorithm.init(init_rng)
+        else:
+            init_rng = jax.random.key(seed)
+            init_kwargs = self.get_algorithm_init_kwargs(init_rng)            
+            self._algorithm_state = self._algorithm.init(**init_kwargs)
 
         # Training kwargs
         train_kw_args = {
 
@@ -394,7 +394,8 @@ def init(
             _action = self.env.sample_actions(dummy_rng)
 
             # for x64 enabled runs we have to explicitly cast the dummy action
-            _action = jnp.array(_action, dtype=jnp.float64)
+            dtype = jnp.float64 if jax.config.jax_enable_x64 else jnp.float32
+            _action = jnp.array(_action, dtype=dtype)
 
             _, (_obs, _reward, _done, _) = self.env.step(env_state, _action, dummy_rng)
 
 
@@ -22,6 +22,7 @@ dependencies = [
   "coverage==7.4.4",
   "chex==0.1.86",
   "xminigrid==0.8.0",
+  "gymnasium==1.2.0",
   "ruff",
   "hydra-core",
   "hydra-submitit-launcher",
@@ -57,7 +58,7 @@ tooling = ["commitizen", "pre-commit", "ruff"]
 test = ["pytest", "pytest-coverage", "pytest-cases", "ARLBench[examples]"]
 examples = ["hypersweeper"]
 doc = [
-  "automl_sphinx_theme", "gymnasium==0.29.1"
+  "automl_sphinx_theme"
 ]
 envpool = ["envpool==0.8.4"]
 
 
@@ -0,0 +1 @@
+"""Unit test package for arlbench."""
@@ -0,0 +1 @@
+"""Unit test package for autorl subpackage."""
@@ -0,0 +1,284 @@
+from __future__ import annotations
+
+import pytest
+from arlbench import AutoRLEnv
+from arlbench.core.algorithms import DQN
+
+
+def test_autorl_env_dqn_default_obs():
+    """DQN without extra state features: a step yields only the ``steps`` entry."""
+    env = AutoRLEnv(
+        config={
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e6,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": [],
+            "n_steps": 10,
+        }
+    )
+
+    # The observation returned by reset() is expected to be empty.
+    reset_obs, _ = env.reset()
+    assert len(reset_obs.keys()) == 0
+
+    sampled_config = env.config_space.sample_configuration()
+    step_obs, results, _, truncated, _ = env.step(sampled_config)
+    assert len(step_obs.keys()) == 1
+    assert step_obs["steps"].shape == (2,)
+    assert truncated is False
+    assert results["reward_mean"] > 0
+
+
+def test_autorl_env_dqn_grad_obs():
+    """DQN with the ``grad_info`` state feature: a step yields two entries."""
+    env = AutoRLEnv(
+        config={
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e5,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": ["grad_info"],
+            "n_steps": 10,
+        }
+    )
+
+    # The observation returned by reset() is expected to be empty.
+    reset_obs, _ = env.reset()
+    assert len(reset_obs.keys()) == 0
+
+    default_config = env.config_space.get_default_configuration()
+    step_obs, results, _, truncated, _ = env.step(default_config)
+    assert len(step_obs.keys()) == 2
+    assert step_obs["steps"].shape == (2,)
+    assert step_obs["grad_info"].shape == (2,)
+    assert truncated is False
+    assert results["reward_mean"] > 0
+
+
+def test_autorl_env_ppo_grad_obs():
+    """PPO with the ``grad_info`` state feature: a step yields two entries."""
+    env = AutoRLEnv(
+        config={
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "ppo",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e5,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": ["grad_info"],
+            "n_steps": 10,
+        }
+    )
+
+    # The observation returned by reset() is expected to be empty.
+    reset_obs, _ = env.reset()
+    assert len(reset_obs.keys()) == 0
+
+    default_config = env.config_space.get_default_configuration()
+    step_obs, results, _, truncated, _ = env.step(default_config)
+    assert len(step_obs.keys()) == 2
+    assert step_obs["steps"].shape == (2,)
+    assert step_obs["grad_info"].shape == (2,)
+    assert truncated is False
+    assert results["reward_mean"] > 0
+
+
+def test_autorl_env_sac_grad_obs():
+    """SAC on Pendulum with the ``grad_info`` state feature enabled."""
+    env = AutoRLEnv(
+        config={
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "Pendulum-v1",
+            "n_envs": 10,
+            "algorithm": "sac",
+            "cnn_policy": False,
+            "n_total_timesteps": 5e4,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": ["grad_info"],
+            "n_steps": 10,
+        }
+    )
+
+    # The observation returned by reset() is expected to be empty.
+    reset_obs, _ = env.reset()
+    assert len(reset_obs.keys()) == 0
+
+    default_config = env.config_space.get_default_configuration()
+    step_obs, results, _, truncated, _ = env.step(default_config)
+    assert len(step_obs.keys()) == 2
+    assert step_obs["steps"].shape == (2,)
+    assert step_obs["grad_info"].shape == (2,)
+    assert truncated is False
+    assert results["reward_mean"] > -2000
+
+
+def test_autorl_env_dqn_per_switch():
+    """Toggle ``buffer_prio_sampling`` between consecutive DQN steps.
+
+    Three steps are run with alternating settings, then the environment is
+    reset and three more steps are run with the opposite alternation.
+    """
+    env = AutoRLEnv(
+        {
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e6,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": [],
+            "n_steps": 10,
+        }
+    )
+    _, _ = env.reset()
+    action = env.config_space.get_default_configuration()
+
+    # (buffer_prio_sampling value, strict lower bound on reward_mean) per step
+    first_run = ((True, 100), (False, 150), (True, 200))
+    for use_prio, min_reward in first_run:
+        action["buffer_prio_sampling"] = use_prio
+        _, results, _, _, _ = env.step(action)
+        assert results["reward_mean"] > min_reward
+
+    # Second run after a reset, starting without prioritized sampling.
+    _, _ = env.reset()
+    for use_prio in (False, True, False):
+        action["buffer_prio_sampling"] = use_prio
+        _, results, _, _, _ = env.step(action)
+        assert results["reward_mean"] > 200
+
+
+def test_autorl_env_dqn_dac():
+    """Run three episodes of three steps each and check truncation on the last."""
+    env = AutoRLEnv(
+        {
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e6,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": [],
+            "n_steps": 3,
+        }
+    )
+
+    # perform 3 HPO steps
+    for _episode in range(3):
+        _, _ = env.reset()
+        n_taken = 0
+        while True:
+            sampled_config = env.config_space.sample_configuration()
+
+            step_obs, results, _, truncated, _ = env.step(sampled_config)
+            n_taken += 1
+            assert len(step_obs.keys()) == 1
+            assert step_obs["steps"].shape == (2,)
+            assert results["reward_mean"] > 0
+            # Stop stepping as soon as the environment reports truncation.
+            if truncated:
+                break
+        assert truncated is True
+        assert n_taken == 3
+
+
+def test_autorl_env_dqn_hpo():
+    """With ``n_steps`` set to 1, the single step is reported as truncated."""
+    env = AutoRLEnv(
+        {
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e5,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": [],
+            "n_steps": 1,  # Classic (static) HPO
+        }
+    )
+
+    _, _ = env.reset()
+    sampled_config = env.config_space.sample_configuration()
+    step_obs, results, _, truncated, _ = env.step(sampled_config)
+    assert len(step_obs.keys()) == 1
+    assert step_obs["steps"].shape == (2,)
+    assert results["reward_mean"] > 0
+    assert truncated is True
+
+
+def test_autorl_env_step_before_reset():
+    """Calling ``step()`` on a fresh environment must raise a ``ValueError``."""
+    config = {
+        "seed": 42,
+        "env_framework": "gymnax",
+        "env_name": "CartPole-v1",
+        "n_envs": 10,
+        "algorithm": "dqn",
+        "cnn_policy": False,
+        "n_total_timesteps": 1e6,
+        "n_eval_steps": 10,
+        "checkpoint": [],
+        "objectives": ["reward_mean"],
+        "state_features": [],
+        "n_steps": 1,  # Classic HPO
+    }
+
+    env = AutoRLEnv(config)
+
+    # Build the action outside the ``pytest.raises`` block so that only the
+    # ``env.step`` call is allowed to produce the expected exception.
+    action = dict(DQN.get_hpo_config_space().sample_configuration())
+
+    with pytest.raises(ValueError) as excinfo:
+        env.step(action)
+
+    assert "Called step() before reset()" in str(excinfo.value)
+
+
+def test_autorl_env_forbidden_step():
+    """A second ``step()`` after the only allowed one must raise a ``ValueError``."""
+    env = AutoRLEnv(
+        {
+            "seed": 42,
+            "env_framework": "gymnax",
+            "env_name": "CartPole-v1",
+            "n_envs": 10,
+            "algorithm": "dqn",
+            "cnn_policy": False,
+            "n_total_timesteps": 1e5,
+            "n_eval_steps": 10,
+            "checkpoint": [],
+            "objectives": ["reward_mean"],
+            "state_features": [],
+            "n_steps": 1,  # Classic HPO
+        }
+    )
+    env.reset()
+    sampled_config = env.config_space.sample_configuration()
+
+    # First step is allowed (n_steps is 1) ...
+    env.step(sampled_config)
+
+    # ... the next one without a reset in between is not.
+    with pytest.raises(ValueError) as excinfo:
+        env.step(sampled_config)
+
+    assert "Called step() before reset()" in str(excinfo.value)
+
+
+# When executed as a script, run only the buffer_prio_sampling switch test.
+if __name__ == "__main__":
+    test_autorl_env_dqn_per_switch()
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Unit test package for autorl subpackage."""`