From 496e0b0e29bcfb48fc609d58ffb46c4f99e67bb4 Mon Sep 17 00:00:00 2001
From: Patrick <>
Date: Sat, 8 Jun 2024 10:16:01 +0200
Subject: [PATCH 01/52] adjust version

---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index 8acdd82..7b32b52 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.0.1
+0.1.0.dev

From 01358eed96ed22e4ffc8d5e3e58459bc5b3df6f1 Mon Sep 17 00:00:00 2001
From: Patrick Hopf <>
Date: Thu, 27 Jun 2024 22:27:24 +0200
Subject: [PATCH 02/52] prepare for beta release

---
 .github/workflows/build.yml | 211 ++++++------------------------------
 MANIFEST.in                 |   1 +
 requirements.txt            |   8 +-
 setup.cfg                   |   6 +-
 4 files changed, 39 insertions(+), 187 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d99f56f..3db5b94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -11,199 +11,50 @@ on:
         default: ""
 
 jobs:
-  # The deploy_test job is part of the test of whether we should deploy to PyPI
-  # or test.PyPI. The job will succeed if either the confirmation reference is
-  # empty, 'test' or if the confirmation is the selected branch or tag name.
-  # It will fail if it is nonempty and does not match.  All later jobs depend
-  # on this job, so that they will be immediately cancelled if the confirmation
-  # is bad.  The dependency is currently necessary (2021-03) because GitHub
-  # Actions does not have a simpler method of cancelling an entire workflow---
-  # the normal use-case expects to try and run as much as possible despite one
-  # or two failures.
-  deploy_test:
-    name: Verify PyPI deployment confirmation
-    runs-on: ubuntu-latest
-    env:
-      GITHUB_REF: ${{ github.ref }}
-      CONFIRM_REF: ${{ github.event.inputs.confirm_ref }}
-    steps:
-      - name: Compare confirmation to current reference
-        shell: bash
-        run: |
-          [[ -z $CONFIRM_REF || $GITHUB_REF =~ ^refs/(heads|tags)/$CONFIRM_REF$ || $CONFIRM_REF == "test" ]]
-          if [[ $CONFIRM_REF == "test" ]]; then
-            echo "Build and deploy to test.pypi.org."
-          elif [[ -z $CONFIRM_REF ]]; then
-            echo "Build only.  Nothing will be uploaded to PyPI."
-          else
-            echo "Full build and deploy.  Wheels and source will be uploaded to PyPI."
-          fi
-
-  build_sdist:
-    name: Build sdist on Ubuntu
-    needs: deploy_test
-    runs-on: ubuntu-latest
-    env:
-      OVERRIDE_VERSION: ${{ github.event.inputs.override_version }}
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v4
-        name: Install Python
-        with:
-          # For the sdist we should be as conservative as possible with our
-          # Python version.  This should be the lowest supported version.  This
-          # means that no unsupported syntax can sneak through.
-          python-version: "3.10"
-
-      - name: Install pip build
-        run: |
-          python -m pip install 'build'
-
-      - name: Build sdist tarball
-        shell: bash
-        run: |
-          if [[ ! -z "$OVERRIDE_VERSION" ]]; then echo "$OVERRIDE_VERSION" > VERSION; fi
-          # The build package is the reference PEP 517 package builder.  All
-          # dependencies are specified by our setup code.
-          python -m build --sdist .
-
-      # Zip files are not part of PEP 517, so we need to make our own.
-      - name: Create zipfile from tarball
-        shell: bash
-        working-directory: dist
-        run: |
-          # First assert that there is exactly one tarball, and find its name.
-          shopt -s failglob
-          tarball_pattern="*.tar.gz"
-          tarballs=($tarball_pattern)
-          [[ ${#tarballs[@]} == 1 ]]
-          tarball="${tarballs[0]}"
-          # Get the stem and make the zipfile name.
-          stem="${tarball%.tar.gz}"
-          zipfile="${stem}.zip"
-          # Extract the tarball and rezip it.
-          tar -xzf "$tarball"
-          zip "$zipfile" -r "$stem"
-          rm -r "$stem"
-
-      - uses: actions/upload-artifact@v3
-        with:
-          name: sdist
-          path: |
-            dist/*.tar.gz
-            dist/*.zip
-          if-no-files-found: error
-
-  build_wheels:
-    name: Build wheels on ${{ matrix.os }}
-    needs: deploy_test
+  build:
+    name: Build distribution 📦
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
-    env:
-      # Set up wheels matrix.  This is CPython 3.10--3.12 for all OS targets.
-      CIBW_BUILD: "cp3{10,11,12}-*"
-      # Numpy and SciPy do not supply wheels for i686 or win32 for
-      # Python 3.10+, so we skip those:
-      CIBW_SKIP: "*-musllinux* cp3{10,11,12}-manylinux_i686 cp3{10,11,12}-win32"
-      OVERRIDE_VERSION: ${{ github.event.inputs.override_version }}
+        os: [ubuntu-latest, windows-latest, macOS-latest]
 
     steps:
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v4
-        name: Install Python
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          # This is about the build environment, not the released wheel version.
           python-version: "3.10"
-
       - name: Install cibuildwheel
-        run: |
-          # cibuildwheel does the heavy lifting for us. Originally tested on
-          # 2.11.3, but should be fine at least up to any minor new release.
-          python -m pip install 'cibuildwheel==2.11.*'
-
-      - name: Build wheels
-        shell: bash
-        run: |
-          # If the version override was specified, then write it the VERSION
-          # file with it.
-          if [[ ! -z "$OVERRIDE_VERSION" ]]; then echo "$OVERRIDE_VERSION" > VERSION; fi
-          python -m cibuildwheel --output-dir wheelhouse
-
-      - uses: actions/upload-artifact@v3
+        run: >-
+          python3 -m
+          pip install
+          cibuildwheel
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m cibuildwheel --output-dir dist
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v3
         with:
-          name: wheels
-          path: ./wheelhouse/*.whl
+          name: python-package-distributions
+          path: dist/
 
-  deploy:
-    name: "Deploy to PyPI if desired"
-    # The confirmation is tested explicitly in `deploy_test`, so we know it is
-    # either a missing confirmation (so we shouldn't run this job), 'test' or a
-    # valid confirmation.  We don't need to retest the value of the
-    # confirmation, beyond checking that one existed.
+  publish-to-pypi:
+    name: Publish Python 🐍 distribution 📦 to PyPI
     if: ${{ github.event.inputs.confirm_ref != '' && github.event.inputs.confirm_ref != 'test' }}
-    needs: [deploy_test, build_sdist, build_wheels]
-    runs-on: ubuntu-latest
-    env:
-      TWINE_USERNAME: __token__
-      TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-      TWINE_NON_INTERACTIVE: 1
-      TWINE_REPOSITORY: pypi
-
-    steps:
-      - name: Download build artifacts to local runner
-        uses: actions/download-artifact@v3
-
-      - uses: actions/setup-python@v4
-        name: Install Python
-        with:
-          python-version: "3.10"
-
-      - name: Verify this is not a dev version
-        shell: bash
-        run: |
-          python -m pip install wheels/*-cp310-cp310-manylinux*.whl
-          python -c 'import qutip_qoc; print(qutip_qoc.__version__); assert "dev" not in qutip_qoc.__version__; assert "+" not in qutip_qoc.__version__'
-
-      # We built the zipfile for convenience distributing to Windows users on
-      # our end, but PyPI only needs the tarball.
-      - name: Upload sdist and wheels to PyPI
-        run: |
-          python -m pip install "twine"
-          python -m twine upload --verbose wheels/*.whl sdist/*.tar.gz
-
-  deploy_testpypi:
-    name: "Deploy to TestPyPI if desired"
-    if: ${{ github.event.inputs.confirm_ref == 'test' }}
-    needs: [deploy_test, build_sdist, build_wheels]
+    needs:
+      - build
     runs-on: ubuntu-latest
-    env:
-      TWINE_USERNAME: __token__
-      TWINE_PASSWORD: ${{ secrets.TESTPYPI_TOKEN }}
-      TWINE_NON_INTERACTIVE: 1
+    environment:
+      name: pypi
+      url: https://pypi.org/p/qutip-qoc
+    permissions:
+      id-token: write
 
     steps:
-      - name: Download build artifacts to local runner
+      - name: Download all the dists
         uses: actions/download-artifact@v3
-
-      - uses: actions/setup-python@v4
-        name: Install Python
         with:
-          python-version: "3.10"
-
-      - name: Verify this is not a dev version
-        shell: bash
-        run: |
-          python -m pip install wheels/*-cp310-cp310-manylinux*.whl
-          python -c 'import qutip_qoc; print(qutip_qoc.__version__); assert "dev" not in qutip_qoc.__version__; assert "+" not in qutip_qoc.__version__'
-
-      # We built the zipfile for convenience distributing to Windows users on
-      # our end, but PyPI only needs the tarball.
-      - name: Upload sdist and wheels to TestPyPI
-        run: |
-          python -m pip install "twine"
-          python -m twine upload --repository testpypi --verbose wheels/*.whl sdist/*.tar.gz
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/MANIFEST.in b/MANIFEST.in
index 4681b1f..7eac3bf 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,5 @@
 include README.md
+include VERSION
 include LICENSE
 include requirements.txt
 include pyproject.toml
diff --git a/requirements.txt b/requirements.txt
index 70eaeb7..cea30e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 cython>=1.0
 numpy>=1.16.6
 scipy>=1.10.1
-jax>=0.4.23
-jaxlib>=0.4.23
+jax==0.4.28
+jaxlib==0.4.28
 qutip>=5.0.1
-qutip-qtrl @ git+https://github.com/qutip/qutip-qtrl.git@master
-qutip-jax @ git+https://github.com/qutip/qutip-jax.git@master
+qutip-qtrl
+qutip-jax
 pre-commit
diff --git a/setup.cfg b/setup.cfg
index 1bb2a96..3d16ee1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -8,7 +8,7 @@ keywords = quantum, physics, dynamics
 license = BSD 3-Clause License
 license_files = LICENSE
 classifiers =
-    Development Status :: 2 - Pre-Alpha
+    Development Status :: 4 - Beta
     Intended Audience :: Science/Research
     License :: OSI Approved :: BSD License
     Programming Language :: Python
@@ -32,8 +32,8 @@ install_requires =
     jaxlib
     packaging
     qutip
-    qutip-qtrl @ git+https://github.com/qutip/qutip-qtrl.git@master
-    qutip-jax @ git+https://github.com/qutip/qutip-jax.git@master
+    qutip-qtrl
+    qutip-jax
 setup_requires =
     cython>=1.0
     packaging

From 191c819f921763db1daf211fbe41ab47429b0ab2 Mon Sep 17 00:00:00 2001
From: Patrick Hopf <>
Date: Sat, 29 Jun 2024 11:25:56 +0200
Subject: [PATCH 03/52] use patched qtrl for tests

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 69e1b03..2bf4a32 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -61,7 +61,7 @@ jobs:
       - name: Install qutip-qtrl from GitHub
         if: ${{ matrix.qutip-version == '@master'}}
         run: |
-          python -m pip install 'git+https://github.com/qutip/qutip-qtrl.git'
+          python -m pip install 'git+https://github.com/qutip/qutip-qtrl.git@patrick/patch_numpy2'
 
       - name: Install fixed version jax and jaxlib
         run: |

From 8b489b651fa889d75e721fb57d270848d2a1fd37 Mon Sep 17 00:00:00 2001
From: Patrick Hopf <>
Date: Sat, 29 Jun 2024 12:32:46 +0200
Subject: [PATCH 04/52] limit numpy version <2

---
 .github/workflows/test.yml | 2 +-
 doc/requirements.txt       | 2 +-
 doc/rtd-environment.yml    | 2 +-
 requirements.txt           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2bf4a32..69e1b03 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -61,7 +61,7 @@ jobs:
       - name: Install qutip-qtrl from GitHub
         if: ${{ matrix.qutip-version == '@master'}}
         run: |
-          python -m pip install 'git+https://github.com/qutip/qutip-qtrl.git@patrick/patch_numpy2'
+          python -m pip install 'git+https://github.com/qutip/qutip-qtrl.git'
 
       - name: Install fixed version jax and jaxlib
         run: |
diff --git a/doc/requirements.txt b/doc/requirements.txt
index 48e2d81..1cf3d0a 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -1,4 +1,4 @@
-numpy>=1.16.6
+numpy>=1.16.6,<2.0
 scipy>=1.10.1
 jax==0.4.28
 jaxlib==0.4.28
diff --git a/doc/rtd-environment.yml b/doc/rtd-environment.yml
index d2e31c3..1d8a3db 100644
--- a/doc/rtd-environment.yml
+++ b/doc/rtd-environment.yml
@@ -2,7 +2,7 @@ name: rtd-environment
 channels:
   - conda-forge
 dependencies:
-  - numpy>=1.16.6
+  - numpy>=1.16.6,<2.0
   - scipy>=1.10.1
   - cython>=0.29.33
   - sphinx==6.1.3
diff --git a/requirements.txt b/requirements.txt
index cea30e5..a08565e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 cython>=1.0
-numpy>=1.16.6
+numpy>=1.16.6,<2.0
 scipy>=1.10.1
 jax==0.4.28
 jaxlib==0.4.28

From 5392f6ccf9b32b44cc42d4c14fc84e08539f41ce Mon Sep 17 00:00:00 2001
From: Patrick Hopf <>
Date: Sun, 30 Jun 2024 09:06:40 +0200
Subject: [PATCH 05/52] update setup with numpy version constraint

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 3d16ee1..3903195 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,6 +34,7 @@ install_requires =
     qutip
     qutip-qtrl
     qutip-jax
+    numpy>=1.16.6,<2.0
 setup_requires =
     cython>=1.0
     packaging

From 4f6a5a3e80df52baff0789beae6df1cf2f690ca1 Mon Sep 17 00:00:00 2001
From: Patrick Hopf <>
Date: Tue, 6 Aug 2024 16:40:45 +0200
Subject: [PATCH 06/52] some suggestions

---
 src/qutip_qoc/_rl.py         | 112 +++++++++++++++++++++++++++++++++++
 src/qutip_qoc/pulse_optim.py |  17 ++++++
 tests/test_result.py         |  37 ++++++++++++
 3 files changed, 166 insertions(+)
 create mode 100644 src/qutip_qoc/_rl.py

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
new file mode 100644
index 0000000..bb12900
--- /dev/null
+++ b/src/qutip_qoc/_rl.py
@@ -0,0 +1,112 @@
+"""
+This module contains ...
+"""
+import qutip as qt
+from qutip import Qobj, QobjEvo
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import PPO
+from stable_baselines3.common.env_checker import check_env
+
+
+class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) implementation
+    """
+    Class for storing a control problem and ...
+    """
+
+    def __init__(
+        self,
+        objective,
+        time_interval,
+        time_options,
+        control_parameters,
+        alg_kwargs,
+        guess_params,
+        **integrator_kwargs,
+    ):
+        super().__init__() # TODO: super init your gym environment here
+
+        # ------------------------------- copied from _GOAT class -------------------------------
+        
+        # TODO: you dont have to use (or keep them) if you don't need the following attributes
+        # this is just an inspiration how to extract information from the input 
+
+        self._Hd = objective.H[0]
+        self._Hc_lst = objective.H[1:]
+
+        self._control_parameters = control_parameters
+        self._guess_params = guess_params
+        self._H = self._prepare_generator()
+
+        self._initial = objective.initial
+        self._target = objective.target
+
+        self._evo_time = time_interval.evo_time
+
+        # inferred attributes
+        self._norm_fac = 1 / self._target.norm()
+
+        # integrator options
+        self._integrator_kwargs = integrator_kwargs
+
+        self._rtol = self._integrator_kwargs.get("rtol", 1e-5)
+        self._atol = self._integrator_kwargs.get("atol", 1e-5)
+
+        # choose solver and fidelity type according to problem
+        if self._Hd.issuper:
+            self._fid_type = alg_kwargs.get("fid_type", "TRACEDIFF")
+            self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
+
+        else:
+            self._fid_type = alg_kwargs.get("fid_type", "PSU")
+            self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
+
+        self.infidelity = self._infid # TODO: should be used to calculate the reward
+
+        # ----------------------------------------------------------------------------------------
+        # TODO: set up your gym environment as you did correctly in post10
+        self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
+        self.max_steps = time_interval.n_tslots                         # maximum number of steps in an episode
+        self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole()
+        ...
+        
+        
+        # ----------------------------------------------------------------------------------------
+    
+    def _infid(self, params):
+        """
+        Calculate infidelity to be minimized
+        """
+        X = self._solver.run(
+            self._initial, [0.0, self._evo_time], args={"p": params}
+        ).final_state
+
+        if self._fid_type == "TRACEDIFF":
+            diff = X - self._target
+            # to prevent if/else in qobj.dag() and qobj.tr()
+            diff_dag = Qobj(diff.data.adjoint(), dims=diff.dims)
+            g = 1 / 2 * (diff_dag * diff).data.trace()
+            infid = np.real(self._norm_fac * g)
+        else:
+            g = self._norm_fac * self._target.overlap(X)
+            if self._fid_type == "PSU":  # f_PSU (drop global phase)
+                infid = 1 - np.abs(g)
+            elif self._fid_type == "SU":  # f_SU (incl global phase)
+                infid = 1 - np.real(g)
+
+        return infid
+
+    # TODO: don't hesitate to add the required methods for your rl environment
+
+    def step(self, action):
+        ...
+
+    def train(self):
+        ...
+
+    def result(self):
+        # TODO: return qoc.Result object with the optimized pulse amplitudes
+        ...
\ No newline at end of file
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 45866be..661c577 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -10,6 +10,7 @@
 
 from qutip_qoc._optimizer import _global_local_optimization
 from qutip_qoc._time import _TimeInterval
+from qutip_qoc._rl import _RL
 
 __all__ = ["optimize_pulses"]
 
@@ -348,6 +349,22 @@ def optimize_pulses(
 
             qtrl_optimizers.append(qtrl_optimizer)
 
+    # TODO: we can deal with proper handling later
+    if alg == "RL":
+        rl_env = _RL(
+            objectives,
+            control_parameters,
+            time_interval,
+            time_options,
+            algorithm_kwargs,
+            optimizer_kwargs,
+            minimizer_kwargs,
+            integrator_kwargs,
+            qtrl_optimizers,
+        )
+        rl_env.train()
+        return rl_env.result()
+
     return _global_local_optimization(
         objectives,
         control_parameters,
diff --git a/tests/test_result.py b/tests/test_result.py
index 0299e95..084aa74 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -152,6 +152,41 @@ def sin_z_jax(t, r, **kwargs):
     algorithm_kwargs={"alg": "CRAB", "fid_err_targ": 0.01, "fix_frequency": False},
 )
 
+# ----------------------- RL --------------------
+# TODO: this is the input for optimiz_pulses() function
+# you can use this routine to test your implementation
+
+# state to state transfer
+init = qt.basis(2, 0)
+target = qt.basis(2, 1)
+
+H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians
+
+w, d, y = 0.1, 1.0, 0.1
+H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax()) # drift Hamiltonian
+
+H = [H_d] + H_c # total Hamiltonian
+
+state2state_rl = Case(
+    objectives=[Objective(initial, H, target)],
+    control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds
+    tlist=np.linspace(0, 10, 100), # TODO: derive single step duration and max evo time / max num steps from this
+    algorithm_kwargs={
+        "fid_err_targ": 0.01,
+        "alg": "RL",
+        "max_iter": 100,
+    }
+)
+
+# TODO: no big difference for unitary evolution
+
+initial = qt.qeye(2) # Identity
+target  = qt.gates.hadamard_transform()
+
+unitary_rl = state2state_rl._replace(
+    objectives=[Objective(initial, H, target)],
+)
+
 
 @pytest.fixture(
     params=[
@@ -160,6 +195,8 @@ def sin_z_jax(t, r, **kwargs):
         pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
         pytest.param(state2state_goat, id="State to state (GOAT)"),
         pytest.param(state2state_jax, id="State to state (JAX)"),
+        pytest.param(state2state_rl, id="State to state (RL)"),
+        pytest.param(unitary_rl, id="Unitary (RL)"),
     ]
 )
 def tst(request):

From 1b86b141b1033b1e3cf74e49084158012e7c0c9d Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Fri, 9 Aug 2024 00:00:27 +0200
Subject: [PATCH 07/52] state2state_rl implementation

---
 src/qutip_qoc/_rl.py         | 144 ++++++++++++++++++++++++++++-------
 src/qutip_qoc/pulse_optim.py |   2 +-
 tests/test_result.py         |  34 +++++----
 3 files changed, 137 insertions(+), 43 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index bb12900..94989c0 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -3,6 +3,7 @@
 """
 import qutip as qt
 from qutip import Qobj, QobjEvo
+from qutip_qoc import Result
 
 import numpy as np
 
@@ -11,6 +12,7 @@
 from stable_baselines3 import PPO
 from stable_baselines3.common.env_checker import check_env
 
+import time
 
 class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) implementation
     """
@@ -19,50 +21,83 @@ class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env)
 
     def __init__(
         self,
-        objective,
+        objectives,
+        control_parameters,
         time_interval,
         time_options,
-        control_parameters,
         alg_kwargs,
-        guess_params,
-        **integrator_kwargs,
+        optimizer_kwargs,
+        minimizer_kwargs,
+        integrator_kwargs,
+        qtrl_optimizers,
     ):
-        super().__init__() # TODO: super init your gym environment here
+        super(_RL,self).__init__() # TODO:(ok) super init your gym environment here
 
         # ------------------------------- copied from _GOAT class -------------------------------
         
         # TODO: you dont have to use (or keep them) if you don't need the following attributes
         # this is just an inspiration how to extract information from the input 
-
-        self._Hd = objective.H[0]
-        self._Hc_lst = objective.H[1:]
+        self._Hd_lst, self._Hc_lst = [], []
+        if not isinstance(objectives, list):
+            objectives = [objectives]
+        for objective in objectives:
+            # extract drift and control Hamiltonians from the objective
+            self._Hd_lst.append(objective.H[0])
+            self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]])
 
         self._control_parameters = control_parameters
-        self._guess_params = guess_params
-        self._H = self._prepare_generator()
+        # extract bounds for _control_parameters
+        bounds = []
+        for key in control_parameters.keys():
+            bounds.append(control_parameters[key].get("bounds"))
+        self.lbound = [b[0][0] for b in bounds]
+        self.ubound = [b[0][1] for b in bounds]
+
+        #self._H = self._prepare_generator()
 
         self._initial = objective.initial
         self._target = objective.target
+        self.state = None
+
+        self._result = Result(
+            objectives = objectives,
+            time_interval = time_interval,
+            start_local_time = time.localtime(),    # initial optimization time
+            n_iters = 0,                            # Number of iterations(episodes) until convergence 
+            iter_seconds = [],                      # list containing the time taken for each iteration(episode) of the optimization
+            var_time = False,                       # Whether the optimization was performed with variable time
+        )
+
+        #for the reward
+        self._step_penalty = 1
 
-        self._evo_time = time_interval.evo_time
+        # To check if it exceeds the maximum number of steps in an episode
+        self.current_step = 0
+
+        #self._evo_time = time_interval.evo_time
+        self._fid_err_targ = alg_kwargs["fid_err_targ"]
 
         # inferred attributes
         self._norm_fac = 1 / self._target.norm()
 
+        # control function
+        self.pulse = self._pulse
+
+        # contains the action of the last completed or truncated episode
+        self.action = None
+
         # integrator options
         self._integrator_kwargs = integrator_kwargs
-
         self._rtol = self._integrator_kwargs.get("rtol", 1e-5)
         self._atol = self._integrator_kwargs.get("atol", 1e-5)
 
         # choose solver and fidelity type according to problem
-        if self._Hd.issuper:
+        if self._Hd_lst[0].issuper:
             self._fid_type = alg_kwargs.get("fid_type", "TRACEDIFF")
-            self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
-
+            #self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
         else:
             self._fid_type = alg_kwargs.get("fid_type", "PSU")
-            self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
+            #self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
 
         self.infidelity = self._infid # TODO: should be used to calculate the reward
 
@@ -70,18 +105,25 @@ def __init__(
         # TODO: set up your gym environment as you did correctly in post10
         self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
         self.max_steps = time_interval.n_tslots                         # maximum number of steps in an episode
-        self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole()
-        ...
-        
+        self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
+        self.max_episodes = alg_kwargs["max_iter"]                      # maximum number of episodes for training
+        self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
         
+        # Define action and observation spaces (Gym)
+        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # Continuous action space from -1 to +1, as suggested from gym
+        self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32)  # Observation space, |v> have 2 real and 2 imaginary numbers -> 4
         # ----------------------------------------------------------------------------------------
     
-    def _infid(self, params):
+    def _pulse(self, t, p):
+        return p
+
+    def _infid(self, params=None):
         """
         Calculate infidelity to be minimized
         """
+        '''
         X = self._solver.run(
-            self._initial, [0.0, self._evo_time], args={"p": params}
+            self.state, [0.0, self.step_duration], args={"p": params}
         ).final_state
 
         if self._fid_type == "TRACEDIFF":
@@ -96,17 +138,65 @@ def _infid(self, params):
                 infid = 1 - np.abs(g)
             elif self._fid_type == "SU":  # f_SU (incl global phase)
                 infid = 1 - np.real(g)
-
+        '''
+        infid = 1 - qt.fidelity(self.state, self._target)
         return infid
 
     # TODO: don't hesitate to add the required methods for your rl environment
 
     def step(self, action):
-        ...
+        action = action[0]
+        alpha = ((action + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0]   # for example, scale action from [-1, 1] to [9, 13]
+        args = {"alpha" : alpha}
+
+        H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]]
+        step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args)
+        self.state =  step_result.states[-1]
+
+        infidelity = self.infidelity()
+        self._result.infidelity = infidelity
+        reward = (1 - infidelity) - self._step_penalty
+
+        terminated = infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
+        truncated = self.current_step >= self.max_steps                     # if the episode ended without reaching the goal
+
+        if terminated or truncated:
+            time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
+            self._result.iter_seconds.append(time_diff)
+            self.current_step = 0                                           # Reset the step counter
+            self.action = alpha
+
+        observation = self._get_obs()
+        return observation, reward, bool(terminated), bool(truncated), {}
+
+    def _get_obs(self):
+        rho = self.state.full().flatten()                                   # to have state vector as NumPy array and flatten into one dimensional array.[a+i*b c+i*d]
+        obs = np.concatenate((np.real(rho), np.imag(rho)))
+        return obs.astype(np.float32)                                       # Gymnasium expects the observation to be of type float32
+    
+    def reset(self, seed=None):
+        self.state = self._initial
+        return self._get_obs(), {}
+    
+    def result(self):
+        # TODO: return qoc.Result object with the optimized pulse amplitudes
+        self._result.message = "Optimization finished!"
+        self._result.end_local_time = time.localtime()
+        self._result.n_iters = len(self._result.iter_seconds)  
+        self._result.optimized_params = np.array(self.action)
+        self._result._optimized_controls = np.array(self.action)    #TODO: is equal to optimized_params ?
+        self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state]
+        self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
+        self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
+        self._result._guess_controls = []
+        return self._result
 
     def train(self):
-        ...
+        # Check if the environment follows Gym API
+        check_env(self, warn=True)
 
-    def result(self):
-        # TODO: return qoc.Result object with the optimized pulse amplitudes
-        ...
\ No newline at end of file
+        # Create the model
+        model = PPO('MlpPolicy', self, verbose=1)       # verbose = 1 to show training in terminal
+
+        # Train the model
+        model.learn(total_timesteps = self.total_timesteps)
\ No newline at end of file
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 661c577..573a51d 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -152,7 +152,7 @@ def optimize_pulses(
     # extract guess and bounds for the control pulses
     x0, bounds = [], []
     for key in control_parameters.keys():
-        x0.append(control_parameters[key].get("guess"))
+        x0.append(control_parameters[key].get("guess"))         # TODO: for now only consider bounds
         bounds.append(control_parameters[key].get("bounds"))
     try:  # GRAPE, CRAB format
         lbound = [b[0][0] for b in bounds]
diff --git a/tests/test_result.py b/tests/test_result.py
index 084aa74..a7460d7 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -157,7 +157,7 @@ def sin_z_jax(t, r, **kwargs):
 # you can use this routine to test your implementation
 
 # state to state transfer
-init = qt.basis(2, 0)
+initial = qt.basis(2, 0)
 target = qt.basis(2, 1)
 
 H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians
@@ -169,34 +169,38 @@ def sin_z_jax(t, r, **kwargs):
 
 state2state_rl = Case(
     objectives=[Objective(initial, H, target)],
-    control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds
+    #control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds
+    control_parameters = {
+        "p": {"bounds": [(-13, 13)],}
+    },
     tlist=np.linspace(0, 10, 100), # TODO: derive single step duration and max evo time / max num steps from this
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
-        "max_iter": 100,
-    }
+        "max_iter": 700,
+    },
+    optimizer_kwargs={}
 )
 
 # TODO: no big difference for unitary evolution
 
-initial = qt.qeye(2) # Identity
-target  = qt.gates.hadamard_transform()
+#initial = qt.qeye(2) # Identity
+#target  = qt.gates.hadamard_transform()
 
-unitary_rl = state2state_rl._replace(
-    objectives=[Objective(initial, H, target)],
-)
+#unitary_rl = state2state_rl._replace(
+#    objectives=[Objective(initial, H, target)],
+#)
 
 
 @pytest.fixture(
     params=[
-        pytest.param(state2state_grape, id="State to state (GRAPE)"),
-        pytest.param(state2state_crab, id="State to state (CRAB)"),
-        pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
-        pytest.param(state2state_goat, id="State to state (GOAT)"),
-        pytest.param(state2state_jax, id="State to state (JAX)"),
+        #pytest.param(state2state_grape, id="State to state (GRAPE)"),
+        #pytest.param(state2state_crab, id="State to state (CRAB)"),
+        #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
+        #pytest.param(state2state_goat, id="State to state (GOAT)"),
+        #pytest.param(state2state_jax, id="State to state (JAX)"),
         pytest.param(state2state_rl, id="State to state (RL)"),
-        pytest.param(unitary_rl, id="Unitary (RL)"),
+        #pytest.param(unitary_rl, id="Unitary (RL)"),
     ]
 )
 def tst(request):

From 2c6c3025f63a35a6315437499b268b32c12e0d28 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sat, 10 Aug 2024 21:36:20 +0200
Subject: [PATCH 08/52] Using QobjEvo with Hc, Hd and controls. Dimension
 action space variable based on how many controls there are. using mesolve
 with QobjEvo object _H

---
 src/qutip_qoc/_rl.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 94989c0..315e074 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -45,6 +45,13 @@ def __init__(
             self._Hd_lst.append(objective.H[0])
             self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]])
 
+        # create the QobjEvo with Hd, Hc and controls(args)
+        self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))}    # set the control parameters to 1 fo all the Hc
+        self._H_lst = [self._Hd_lst[0]]
+        for i, Hc in enumerate(self._Hc_lst[0]):
+            self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)])
+        self._H = qt.QobjEvo(self._H_lst, self.args)
+
         self._control_parameters = control_parameters
         # extract bounds for _control_parameters
         bounds = []
@@ -80,9 +87,6 @@ def __init__(
         # inferred attributes
         self._norm_fac = 1 / self._target.norm()
 
-        # control function
-        self.pulse = self._pulse
-
         # contains the action of the last completed or truncated episode
         self.action = None
 
@@ -110,12 +114,15 @@ def __init__(
         self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
         
         # Define action and observation spaces (Gym)
-        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # Continuous action space from -1 to +1, as suggested from gym
+        #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # Continuous action space from -1 to +1, as suggested from gym
+        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)
         self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32)  # Observation space, |v> have 2 real and 2 imaginary numbers -> 4
         # ----------------------------------------------------------------------------------------
     
-    def _pulse(self, t, p):
-        return p
+    #def _pulse(self, t, p):
+    #    return p
+    def pulse(self, t, args, idx):
+        return 1*args[f"alpha{idx}"]
 
     def _infid(self, params=None):
         """
@@ -145,12 +152,16 @@ def _infid(self, params=None):
     # TODO: don't hesitate to add the required methods for your rl environment
 
     def step(self, action):
-        action = action[0]
-        alpha = ((action + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0]   # for example, scale action from [-1, 1] to [9, 13]
-        args = {"alpha" : alpha}
+        alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))]   #TODO: use ubound[i] lbound[i] 
+
+        for i, value in enumerate(alphas):
+            self.args[f"alpha{i+1}"] = value
+        self._H = qt.QobjEvo(self._H_lst, self.args)
+
+        #H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]]
+        #step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args)
+        step_result = qt.mesolve(self._H, self.state, [0, self.step_duration])
 
-        H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]]
-        step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args)
         self.state =  step_result.states[-1]
 
         infidelity = self.infidelity()
@@ -164,7 +175,7 @@ def step(self, action):
             time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
             self._result.iter_seconds.append(time_diff)
             self.current_step = 0                                           # Reset the step counter
-            self.action = alpha
+            self.action = alphas
 
         observation = self._get_obs()
         return observation, reward, bool(terminated), bool(truncated), {}

From 973095976abd7cc7840ae5e61f97be4a1d1f09f5 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sun, 11 Aug 2024 21:42:37 +0200
Subject: [PATCH 09/52] added _solver. _infidelity() more general

---
 src/qutip_qoc/_rl.py | 54 +++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 315e074..832c44c 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -46,7 +46,7 @@ def __init__(
             self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]])
 
         # create the QobjEvo with Hd, Hc and controls(args)
-        self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))}    # set the control parameters to 1 fo all the Hc
+        self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))}    # set the control parameters to 1 for all the Hc
         self._H_lst = [self._Hd_lst[0]]
         for i, Hc in enumerate(self._Hc_lst[0]):
             self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)])
@@ -61,6 +61,7 @@ def __init__(
         self.ubound = [b[0][1] for b in bounds]
 
         #self._H = self._prepare_generator()
+        self._alg_kwargs = alg_kwargs
 
         self._initial = objective.initial
         self._target = objective.target
@@ -87,24 +88,14 @@ def __init__(
         # inferred attributes
         self._norm_fac = 1 / self._target.norm()
 
-        # contains the action of the last completed or truncated episode
-        self.action = None
+        self.temp_actions = []                  # temporary list to save episode actions
+        self.actions = []                       # list of actions(lists) of the last episode
 
         # integrator options
         self._integrator_kwargs = integrator_kwargs
         self._rtol = self._integrator_kwargs.get("rtol", 1e-5)
         self._atol = self._integrator_kwargs.get("atol", 1e-5)
 
-        # choose solver and fidelity type according to problem
-        if self._Hd_lst[0].issuper:
-            self._fid_type = alg_kwargs.get("fid_type", "TRACEDIFF")
-            #self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
-        else:
-            self._fid_type = alg_kwargs.get("fid_type", "PSU")
-            #self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
-
-        self.infidelity = self._infid # TODO: should be used to calculate the reward
-
         # ----------------------------------------------------------------------------------------
         # TODO: set up your gym environment as you did correctly in post10
         self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
@@ -118,7 +109,18 @@ def __init__(
         self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)
         self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32)  # Observation space, |v> have 2 real and 2 imaginary numbers -> 4
         # ----------------------------------------------------------------------------------------
-    
+        
+    def update_solver(self): 
+        # choose solver and fidelity type according to problem
+        if self._Hd_lst[0].issuper:
+            self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF")
+            self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
+        else:
+            self._fid_type = self._alg_kwargs.get("fid_type", "PSU")
+            self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
+
+        self.infidelity = self._infid # TODO: should be used to calculate the reward
+
     #def _pulse(self, t, p):
     #    return p
     def pulse(self, t, args, idx):
@@ -128,10 +130,10 @@ def _infid(self, params=None):
         """
         Calculate infidelity to be minimized
         """
-        '''
         X = self._solver.run(
             self.state, [0.0, self.step_duration], args={"p": params}
         ).final_state
+        self.state = X
 
         if self._fid_type == "TRACEDIFF":
             diff = X - self._target
@@ -145,8 +147,7 @@ def _infid(self, params=None):
                 infid = 1 - np.abs(g)
             elif self._fid_type == "SU":  # f_SU (incl global phase)
                 infid = 1 - np.real(g)
-        '''
-        infid = 1 - qt.fidelity(self.state, self._target)
+        #infid = 1 - qt.fidelity(self.state, self._target)
         return infid
 
     # TODO: don't hesitate to add the required methods for your rl environment
@@ -158,13 +159,11 @@ def step(self, action):
             self.args[f"alpha{i+1}"] = value
         self._H = qt.QobjEvo(self._H_lst, self.args)
 
-        #H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]]
-        #step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args)
-        step_result = qt.mesolve(self._H, self.state, [0, self.step_duration])
-
-        self.state =  step_result.states[-1]
-
+        self.update_solver()                # _H has changed
         infidelity = self.infidelity()
+
+        self.current_step += 1
+        self.temp_actions.append(alphas)
         self._result.infidelity = infidelity
         reward = (1 - infidelity) - self._step_penalty
 
@@ -175,7 +174,7 @@ def step(self, action):
             time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
             self._result.iter_seconds.append(time_diff)
             self.current_step = 0                                           # Reset the step counter
-            self.action = alphas
+            self.actions = self.temp_actions.copy()
 
         observation = self._get_obs()
         return observation, reward, bool(terminated), bool(truncated), {}
@@ -186,6 +185,7 @@ def _get_obs(self):
         return obs.astype(np.float32)                                       # Gymnasium expects the observation to be of type float32
     
     def reset(self, seed=None):
+        self.temp_actions = []
         self.state = self._initial
         return self._get_obs(), {}
     
@@ -194,12 +194,14 @@ def result(self):
         self._result.message = "Optimization finished!"
         self._result.end_local_time = time.localtime()
         self._result.n_iters = len(self._result.iter_seconds)  
-        self._result.optimized_params = np.array(self.action)
-        self._result._optimized_controls = np.array(self.action)    #TODO: is equal to optimized_params ?
+        self._result.optimized_params = self.actions.copy()
+        self._result._optimized_controls = self.actions.copy()
         self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state]
         self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
         self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
         self._result._guess_controls = []
+        self._result._optimized_H = [self._H]
+        self._result.guess_params = []
         return self._result
 
     def train(self):

From 2667b6403f6ed2134d44fc845aa359a556060f0d Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Wed, 14 Aug 2024 10:05:40 +0200
Subject: [PATCH 10/52] unitary evolution implementation

---
 src/qutip_qoc/_rl.py | 15 ++++++++++-----
 tests/test_result.py | 16 ++++++++--------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 832c44c..8961c2e 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -63,9 +63,10 @@ def __init__(
         #self._H = self._prepare_generator()
         self._alg_kwargs = alg_kwargs
 
-        self._initial = objective.initial
-        self._target = objective.target
+        self._initial = objectives[0].initial
+        self._target = objectives[0].target
         self.state = None
+        self.dim = self._initial.shape[0]
 
         self._result = Result(
             objectives = objectives,
@@ -105,9 +106,13 @@ def __init__(
         self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
         
         # Define action and observation spaces (Gym)
-        #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # Continuous action space from -1 to +1, as suggested from gym
-        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)
-        self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32)  # Observation space, |v> have 2 real and 2 imaginary numbers -> 4
+        #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  
+        if self._initial.isket:
+            obs_shape = (2 * self.dim,)
+        else:   # for unitary operators 
+            obs_shape = (2 * self.dim * self.dim,)
+        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)     # Continuous action space from -1 to +1, as suggested from gym
+        self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32)              # Observation space
         # ----------------------------------------------------------------------------------------
         
     def update_solver(self): 
diff --git a/tests/test_result.py b/tests/test_result.py
index a7460d7..10fb8bf 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -182,14 +182,14 @@ def sin_z_jax(t, r, **kwargs):
     optimizer_kwargs={}
 )
 
-# TODO: no big difference for unitary evolution
+# no big difference for unitary evolution
 
-#initial = qt.qeye(2) # Identity
-#target  = qt.gates.hadamard_transform()
+initial = qt.qeye(2) # Identity
+target  = qt.gates.hadamard_transform()
 
-#unitary_rl = state2state_rl._replace(
-#    objectives=[Objective(initial, H, target)],
-#)
+unitary_rl = state2state_rl._replace(
+    objectives=[Objective(initial, H, target)],
+)
 
 
 @pytest.fixture(
@@ -199,8 +199,8 @@ def sin_z_jax(t, r, **kwargs):
         #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
         #pytest.param(state2state_goat, id="State to state (GOAT)"),
         #pytest.param(state2state_jax, id="State to state (JAX)"),
-        pytest.param(state2state_rl, id="State to state (RL)"),
-        #pytest.param(unitary_rl, id="Unitary (RL)"),
+        #pytest.param(state2state_rl, id="State to state (RL)"),
+        pytest.param(unitary_rl, id="Unitary (RL)"),
     ]
 )
 def tst(request):

From 19436092cbe89004cd1c8424018c827bc7576059 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sun, 18 Aug 2024 19:51:28 +0200
Subject: [PATCH 11/52] docstring added

---
 src/qutip_qoc/_rl.py | 73 ++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 8961c2e..8434fe6 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -1,5 +1,7 @@
 """
-This module contains ...
+This module contains functions that implement quantum optimal control 
+using reinforcement learning (RL) techniques, allowing for the optimization 
+of control pulse sequences in quantum systems.
 """
 import qutip as qt
 from qutip import Qobj, QobjEvo
@@ -14,9 +16,13 @@
 
 import time
 
-class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) implementation
+class _RL(gym.Env):
     """
-    Class for storing a control problem and ...
+    Class for storing a control problem and implementing quantum optimal 
+    control using reinforcement learning. This class defines a custom 
+    Gym environment that models the dynamics of quantum systems 
+    under various control pulses, and uses RL algorithms to optimize the 
+    parameters of these pulses.
     """
 
     def __init__(
@@ -31,12 +37,14 @@ def __init__(
         integrator_kwargs,
         qtrl_optimizers,
     ):
-        super(_RL,self).__init__() # TODO:(ok) super init your gym environment here
+        """
+        Initialize the reinforcement learning environment for quantum 
+        optimal control. Sets up the system Hamiltonian, control parameters, 
+        and defines the observation and action spaces for the RL agent.
+        """
 
-        # ------------------------------- copied from _GOAT class -------------------------------
+        super(_RL,self).__init__()
         
-        # TODO: you dont have to use (or keep them) if you don't need the following attributes
-        # this is just an inspiration how to extract information from the input 
         self._Hd_lst, self._Hc_lst = [], []
         if not isinstance(objectives, list):
             objectives = [objectives]
@@ -60,7 +68,6 @@ def __init__(
         self.lbound = [b[0][0] for b in bounds]
         self.ubound = [b[0][1] for b in bounds]
 
-        #self._H = self._prepare_generator()
         self._alg_kwargs = alg_kwargs
 
         self._initial = objectives[0].initial
@@ -83,7 +90,6 @@ def __init__(
         # To check if it exceeds the maximum number of steps in an episode
         self.current_step = 0
 
-        #self._evo_time = time_interval.evo_time
         self._fid_err_targ = alg_kwargs["fid_err_targ"]
 
         # inferred attributes
@@ -97,8 +103,6 @@ def __init__(
         self._rtol = self._integrator_kwargs.get("rtol", 1e-5)
         self._atol = self._integrator_kwargs.get("atol", 1e-5)
 
-        # ----------------------------------------------------------------------------------------
-        # TODO: set up your gym environment as you did correctly in post10
         self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
         self.max_steps = time_interval.n_tslots                         # maximum number of steps in an episode
         self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
@@ -106,17 +110,19 @@ def __init__(
         self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
         
         # Define action and observation spaces (Gym)
-        #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  
         if self._initial.isket:
             obs_shape = (2 * self.dim,)
         else:   # for unitary operators 
             obs_shape = (2 * self.dim * self.dim,)
         self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)     # Continuous action space from -1 to +1, as suggested from gym
         self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32)              # Observation space
-        # ----------------------------------------------------------------------------------------
         
     def update_solver(self): 
-        # choose solver and fidelity type according to problem
+        """
+        Update the solver and fidelity type based on the problem setup.
+        Chooses the appropriate solver (Schrödinger or master equation) and 
+        prepares for infidelity calculation.
+        """
         if self._Hd_lst[0].issuper:
             self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF")
             self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
@@ -124,16 +130,17 @@ def update_solver(self):
             self._fid_type = self._alg_kwargs.get("fid_type", "PSU")
             self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
 
-        self.infidelity = self._infid # TODO: should be used to calculate the reward
+        self.infidelity = self._infid
 
-    #def _pulse(self, t, p):
-    #    return p
     def pulse(self, t, args, idx):
+        """
+        Returns the control pulse value at time t for a given index.
+        """
         return 1*args[f"alpha{idx}"]
 
     def _infid(self, params=None):
         """
-        Calculate infidelity to be minimized
+        The agent performs a step, then calculate infidelity to be minimized of the current state against the target state.
         """
         X = self._solver.run(
             self.state, [0.0, self.step_duration], args={"p": params}
@@ -152,13 +159,14 @@ def _infid(self, params=None):
                 infid = 1 - np.abs(g)
             elif self._fid_type == "SU":  # f_SU (incl global phase)
                 infid = 1 - np.real(g)
-        #infid = 1 - qt.fidelity(self.state, self._target)
         return infid
 
-    # TODO: don't hesitate to add the required methods for your rl environment
-
     def step(self, action):
-        alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))]   #TODO: use ubound[i] lbound[i] 
+        """
+        Perform a single time step in the environment, applying the scaled action (control pulse) 
+        chosen by the RL agent. Updates the system's state and computes the reward.
+        """
+        alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] 
 
         for i, value in enumerate(alphas):
             self.args[f"alpha{i+1}"] = value
@@ -185,17 +193,27 @@ def step(self, action):
         return observation, reward, bool(terminated), bool(truncated), {}
 
     def _get_obs(self):
-        rho = self.state.full().flatten()                                   # to have state vector as NumPy array and flatten into one dimensional array.[a+i*b c+i*d]
+        """
+        Get the current state observation for the RL agent. Converts the system's 
+        quantum state or matrix into a real-valued NumPy array suitable for RL algorithms.
+        """
+        rho = self.state.full().flatten()
         obs = np.concatenate((np.real(rho), np.imag(rho)))
         return obs.astype(np.float32)                                       # Gymnasium expects the observation to be of type float32
     
     def reset(self, seed=None):
+        """
+        Reset the environment to the initial state, preparing for a new episode.
+        """
         self.temp_actions = []
         self.state = self._initial
         return self._get_obs(), {}
     
     def result(self):
-        # TODO: return qoc.Result object with the optimized pulse amplitudes
+        """
+        Retrieve the results of the optimization process, including the optimized 
+        pulse sequences, final states, and performance metrics.
+        """
         self._result.message = "Optimization finished!"
         self._result.end_local_time = time.localtime()
         self._result.n_iters = len(self._result.iter_seconds)  
@@ -210,11 +228,14 @@ def result(self):
         return self._result
 
     def train(self):
+        """
+        Train the RL agent on the defined quantum control problem using the specified 
+        reinforcement learning algorithm. Checks environment compatibility with Gym API.
+        """
         # Check if the environment follows Gym API
         check_env(self, warn=True)
 
         # Create the model
-        model = PPO('MlpPolicy', self, verbose=1)       # verbose = 1 to show training in terminal
-
+        model = PPO('MlpPolicy', self, verbose=1)       # verbose = 1 to display training progress and statistics in the terminal
         # Train the model
         model.learn(total_timesteps = self.total_timesteps)
\ No newline at end of file

From 13ddba0253afc2b76807d55ca65991f2b0d50d15 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Tue, 20 Aug 2024 23:11:54 +0200
Subject: [PATCH 12/52] added callback to stop training

---
 src/qutip_qoc/_rl.py | 94 ++++++++++++++++++++++++++++++++++++++------
 tests/test_result.py | 20 +++++-----
 2 files changed, 93 insertions(+), 21 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 8434fe6..943bcd3 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -13,6 +13,7 @@
 from gymnasium import spaces
 from stable_baselines3 import PPO
 from stable_baselines3.common.env_checker import check_env
+from stable_baselines3.common.callbacks import BaseCallback
 
 import time
 
@@ -90,6 +91,10 @@ def __init__(
         # To check if it exceeds the maximum number of steps in an episode
         self.current_step = 0
 
+        self.terminated = False
+        self.truncated = False
+        self.episode_info = []                # to contain some information from the latest episode
+
         self._fid_err_targ = alg_kwargs["fid_err_targ"]
 
         # inferred attributes
@@ -108,6 +113,7 @@ def __init__(
         self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
         self.max_episodes = alg_kwargs["max_iter"]                      # maximum number of episodes for training
         self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
+        self.current_episode = 0                                        # To keep track of the current episode
         
         # Define action and observation spaces (Gym)
         if self._initial.isket:
@@ -137,6 +143,19 @@ def pulse(self, t, args, idx):
         Returns the control pulse value at time t for a given index.
         """
         return 1*args[f"alpha{idx}"]
+    
+    def save_episode_info(self):
+        """
+        Save the information of the last episode before resetting the environment.
+        """
+        episode_data = {
+            "episode": self.current_episode,
+            "final_infidelity": self._result.infidelity,
+            "terminated": self.terminated,
+            "truncated": self.truncated,
+            "steps_used": self.current_step
+        }
+        self.episode_info.append(episode_data)
 
     def _infid(self, params=None):
         """
@@ -180,17 +199,11 @@ def step(self, action):
         self._result.infidelity = infidelity
         reward = (1 - infidelity) - self._step_penalty
 
-        terminated = infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
-        truncated = self.current_step >= self.max_steps                     # if the episode ended without reaching the goal
-
-        if terminated or truncated:
-            time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
-            self._result.iter_seconds.append(time_diff)
-            self.current_step = 0                                           # Reset the step counter
-            self.actions = self.temp_actions.copy()
+        self.terminated = infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
+        self.truncated = self.current_step >= self.max_steps                     # if the episode ended without reaching the goal
 
         observation = self._get_obs()
-        return observation, reward, bool(terminated), bool(truncated), {}
+        return observation, reward, bool(self.terminated), bool(self.truncated), {}
 
     def _get_obs(self):
         """
@@ -205,6 +218,15 @@ def reset(self, seed=None):
         """
         Reset the environment to the initial state, preparing for a new episode.
         """
+        self.save_episode_info()
+
+        time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
+        self._result.iter_seconds.append(time_diff)
+        self.current_step = 0                                           # Reset the step counter
+        self.current_episode += 1                                       # Increment episode counter
+        self.actions = self.temp_actions.copy()
+        self.terminated = False
+        self.truncated = False
         self.temp_actions = []
         self.state = self._initial
         return self._get_obs(), {}
@@ -214,7 +236,6 @@ def result(self):
         Retrieve the results of the optimization process, including the optimized 
         pulse sequences, final states, and performance metrics.
         """
-        self._result.message = "Optimization finished!"
         self._result.end_local_time = time.localtime()
         self._result.n_iters = len(self._result.iter_seconds)  
         self._result.optimized_params = self.actions.copy()
@@ -237,5 +258,56 @@ def train(self):
 
         # Create the model
         model = PPO('MlpPolicy', self, verbose=1)       # verbose = 1 to display training progress and statistics in the terminal
+        
+        stop_callback = EarlyStopTraining(verbose=1)
+        
         # Train the model
-        model.learn(total_timesteps = self.total_timesteps)
\ No newline at end of file
+        model.learn(total_timesteps = self.total_timesteps, callback=stop_callback)
+
+class EarlyStopTraining(BaseCallback):
+    """
+    A callback to stop training based on specific conditions (steps, infidelity, max iterations)
+    """
+    def __init__(self, verbose: int = 0):
+        super(EarlyStopTraining, self).__init__(verbose)
+        self.stop_train = False
+
+    def _on_step(self) -> bool:
+        """
+        This method is required by the BaseCallback class. We use it only to stop the training.
+        """
+        # Check if we need to stop training
+        if self.stop_train:
+            return False  # Stop training
+        return True  # Continue training
+
+    def _on_rollout_start(self) -> None:
+        """
+        This method is called before the rollout starts (before collecting new samples).
+        Checks:
+        - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training.
+        - Stop training if the maximum number of episodes is reached.
+        """
+        env = self.training_env.envs[0].unwrapped
+        max_episodes = env.max_episodes
+        fid_err_targ = env._fid_err_targ
+
+        if len(env.episode_info) >= 100:
+            last_100_episodes = env.episode_info[-100:]
+
+            min_steps = min(info['steps_used'] for info in last_100_episodes)
+            steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes)
+            infid_condition = all(ep['final_infidelity'] <= fid_err_targ for ep in last_100_episodes)
+
+            if steps_condition and infid_condition:
+                env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
+                #print(f"Stopping training as no episode in the last 100 used fewer steps and infidelity was below target infid.")
+                self.stop_train = True  # Stop training
+                #print([ep['steps_used'] for ep in last_100_episodes])
+                #print([ep['final_infidelity'] for ep in last_100_episodes])
+
+        # Check max episodes condition
+        if env.current_episode >= max_episodes:
+            env._result.message = f"Reached {max_episodes} episodes, stopping training."
+            #print(f"Reached {max_episodes} episodes, stopping training.")
+            self.stop_train = True  # Stop training
diff --git a/tests/test_result.py b/tests/test_result.py
index 10fb8bf..f2ff583 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -158,7 +158,7 @@ def sin_z_jax(t, r, **kwargs):
 
 # state to state transfer
 initial = qt.basis(2, 0)
-target = qt.basis(2, 1)
+target = (qt.basis(2, 0) + qt.basis(2, 1)).unit() # |+⟩
 
 H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians
 
@@ -173,23 +173,23 @@ def sin_z_jax(t, r, **kwargs):
     control_parameters = {
         "p": {"bounds": [(-13, 13)],}
     },
-    tlist=np.linspace(0, 10, 100), # TODO: derive single step duration and max evo time / max num steps from this
+    tlist=np.linspace(0, 10, 100),
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
-        "max_iter": 700,
+        "max_iter": 70000,
     },
     optimizer_kwargs={}
 )
 
 # no big difference for unitary evolution
 
-initial = qt.qeye(2) # Identity
-target  = qt.gates.hadamard_transform()
+#initial = qt.qeye(2) # Identity
+#target  = qt.gates.hadamard_transform()
 
-unitary_rl = state2state_rl._replace(
-    objectives=[Objective(initial, H, target)],
-)
+#unitary_rl = state2state_rl._replace(
+#    objectives=[Objective(initial, H, target)],
+#)
 
 
 @pytest.fixture(
@@ -199,8 +199,8 @@ def sin_z_jax(t, r, **kwargs):
         #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
         #pytest.param(state2state_goat, id="State to state (GOAT)"),
         #pytest.param(state2state_jax, id="State to state (JAX)"),
-        #pytest.param(state2state_rl, id="State to state (RL)"),
-        pytest.param(unitary_rl, id="Unitary (RL)"),
+        pytest.param(state2state_rl, id="State to state (RL)"),
+        #pytest.param(unitary_rl, id="Unitary (RL)"),
     ]
 )
 def tst(request):

From f2f1d279db9654c81131a878cf06381fba9a8d52 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Thu, 22 Aug 2024 16:34:35 +0200
Subject: [PATCH 13/52] added RL algorithm

---
 doc/guide/guide-control.rst | 39 +++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/doc/guide/guide-control.rst b/doc/guide/guide-control.rst
index b2029cd..6a0f9aa 100644
--- a/doc/guide/guide-control.rst
+++ b/doc/guide/guide-control.rst
@@ -195,6 +195,45 @@ experimental systematic noise, ...) can be done all in one, using this
 algorithm.
 
 
+The RL Algorithm
+================
+Reinforcement Learning (RL) represents a different approach compared to traditional
+quantum control methods, such as GRAPE and CRAB. Instead of relying on gradients or
+prior knowledge of the system, RL uses an agent that autonomously learns to optimize
+control policies by interacting with the quantum environment.
+
+The RL algorithm consists of three main components:
+**Agent**: The RL agent is responsible for making decisions regarding control
+parameters at each time step. The agent observes the current state of the quantum
+system and chooses an action (i.e., a set of control parameters) based on the current policy.
+**Environment**: The environment represents the quantum system that evolves over time.
+The environment is defined by the system's dynamics, which include drift and control Hamiltonians.
+Each action chosen by the agent induces a response in the environment, which manifests as an
+evolution of the system's state. From this, a reward can be derived.
+**Reward**: The reward is a measure of how much the action chosen by the agent brings the
+quantum system closer to the desired objective. In this context, the objective could be the
+preparation of a specific state, state-to-state transfer, or the synthesis of a quantum gate.
+
+Each interaction between the agent and the environment defines a step.
+A sequence of steps forms an episode.The episode ends when certain conditions, such as reaching
+a specific fidelity, are met.
+The reward function is a crucial component of the RL algorithm. It must be designed to
+accurately reflect the objective of the quantum control problem.
+The algorithm will aim to update its policy to maximize the reward obtained during the
+various episodes of training. This highlights the importance of ensuring that the control
+problem's objectives are well encoded in the reward function. For example, in a state-to-state
+transfer problem, the reward might be based on the fidelity between the final state obtained
+and the desired target state. A common choice is:
+.. math:: R(s, a) = 1 - \text{infidelity}(s_{\text{final}}, s_{\text{target}}) - \text{step penalty}
+Here, the step penalty is a small negative value that encourages the agent to reach the objective
+in as few steps as possible.
+
+In QuTiP, the RL environment is modeled as a custom class derived from the gymnasium library.
+This class allows defining the quantum system's dynamics at each step, the actions the agent
+can take, the observation space, and so on. The RL agent can be trained using pre-existing
+policies such as Proximal Policy Optimization (PPO) from the stable_baselines3 library.
+
+
 Optimal Quantum Control in QuTiP
 ================================
 Defining a control problem with QuTiP is very easy.

From 1fd96626deedf74f9cb6b5bd08b86fd715234496 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Thu, 22 Aug 2024 16:50:45 +0200
Subject: [PATCH 14/52] rendering update

---
 doc/guide/guide-control.rst | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/doc/guide/guide-control.rst b/doc/guide/guide-control.rst
index 6a0f9aa..2976fac 100644
--- a/doc/guide/guide-control.rst
+++ b/doc/guide/guide-control.rst
@@ -203,14 +203,15 @@ prior knowledge of the system, RL uses an agent that autonomously learns to opti
 control policies by interacting with the quantum environment.
 
 The RL algorithm consists of three main components:
-**Agent**: The RL agent is responsible for making decisions regarding control
+
+- **Agent**: The RL agent is responsible for making decisions regarding control
 parameters at each time step. The agent observes the current state of the quantum
 system and chooses an action (i.e., a set of control parameters) based on the current policy.
-**Environment**: The environment represents the quantum system that evolves over time.
+- **Environment**: The environment represents the quantum system that evolves over time.
 The environment is defined by the system's dynamics, which include drift and control Hamiltonians.
 Each action chosen by the agent induces a response in the environment, which manifests as an
 evolution of the system's state. From this, a reward can be derived.
-**Reward**: The reward is a measure of how much the action chosen by the agent brings the
+- **Reward**: The reward is a measure of how much the action chosen by the agent brings the
 quantum system closer to the desired objective. In this context, the objective could be the
 preparation of a specific state, state-to-state transfer, or the synthesis of a quantum gate.
 
@@ -224,7 +225,10 @@ various episodes of training. This highlights the importance of ensuring that th
 problem's objectives are well encoded in the reward function. For example, in a state-to-state
 transfer problem, the reward might be based on the fidelity between the final state obtained
 and the desired target state. A common choice is:
-.. math:: R(s, a) = 1 - \text{infidelity}(s_{\text{final}}, s_{\text{target}}) - \text{step penalty}
+.. math:: 
+  
+  R(s, a) = 1 - \text{infidelity}(s_{\text{final}}, s_{\text{target}}) - \text{step penalty}
+
 Here, the step penalty is a small negative value that encourages the agent to reach the objective
 in as few steps as possible.
 

From 0d2df6f701f9dd05187c9d30147f339bc3f738c7 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Thu, 22 Aug 2024 17:00:17 +0200
Subject: [PATCH 15/52] rendering update

---
 doc/guide/guide-control.rst | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/doc/guide/guide-control.rst b/doc/guide/guide-control.rst
index 2976fac..4212899 100644
--- a/doc/guide/guide-control.rst
+++ b/doc/guide/guide-control.rst
@@ -204,14 +204,14 @@ control policies by interacting with the quantum environment.
 
 The RL algorithm consists of three main components:
 
-- **Agent**: The RL agent is responsible for making decisions regarding control
+**Agent**: The RL agent is responsible for making decisions regarding control
 parameters at each time step. The agent observes the current state of the quantum
 system and chooses an action (i.e., a set of control parameters) based on the current policy.
-- **Environment**: The environment represents the quantum system that evolves over time.
+**Environment**: The environment represents the quantum system that evolves over time.
 The environment is defined by the system's dynamics, which include drift and control Hamiltonians.
 Each action chosen by the agent induces a response in the environment, which manifests as an
 evolution of the system's state. From this, a reward can be derived.
-- **Reward**: The reward is a measure of how much the action chosen by the agent brings the
+**Reward**: The reward is a measure of how much the action chosen by the agent brings the
 quantum system closer to the desired objective. In this context, the objective could be the
 preparation of a specific state, state-to-state transfer, or the synthesis of a quantum gate.
 
@@ -223,14 +223,9 @@ accurately reflect the objective of the quantum control problem.
 The algorithm will aim to update its policy to maximize the reward obtained during the
 various episodes of training. This highlights the importance of ensuring that the control
 problem's objectives are well encoded in the reward function. For example, in a state-to-state
-transfer problem, the reward might be based on the fidelity between the final state obtained
-and the desired target state. A common choice is:
-.. math:: 
-  
-  R(s, a) = 1 - \text{infidelity}(s_{\text{final}}, s_{\text{target}}) - \text{step penalty}
-
-Here, the step penalty is a small negative value that encourages the agent to reach the objective
-in as few steps as possible.
+transfer problem, the reward could be based on the fidelity between the achieved final state
+and the desired target state and subtract a constant penalty term.
+The step penalty is a small value that encourages the agent to reach the objective in as few steps as possible.
 
 In QuTiP, the RL environment is modeled as a custom class derived from the gymnasium library.
 This class allows defining the quantum system's dynamics at each step, the actions the agent

From fadb42e52f9008e2f9846ced4686f77e29c44d04 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Thu, 22 Aug 2024 17:56:25 +0200
Subject: [PATCH 16/52] update docstring

---
 src/qutip_qoc/pulse_optim.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 573a51d..efc5d84 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -25,7 +25,7 @@ def optimize_pulses(
     integrator_kwargs=None,
 ):
     """
-    Run GOAT, JOPT, GRAPE or CRAB optimization.
+    Run GOAT, JOPT, GRAPE, CRAB or RL optimization.
 
     Parameters
     ----------
@@ -37,6 +37,7 @@ def optimize_pulses(
         Dictionary of options for the control pulse optimization.
         The keys of this dict must be a unique string identifier for each control Hamiltonian / function.
         For the GOAT and JOPT algorithms, the dict may optionally also contain the key "__time__".
+        For RL you don't need to specify the guess.
         For each control function it must specify:
 
             control_id : dict
@@ -71,14 +72,15 @@ def optimize_pulses(
 
             - alg : str
                 Algorithm to use for the optimization.
-                Supported are: "GRAPE", "CRAB", "GOAT", "JOPT".
+                Supported are: "GRAPE", "CRAB", "GOAT", "JOPT" and "RL".
 
             - fid_err_targ : float, optional
                 Fidelity error target for the optimization.
 
             - max_iter : int, optional
                 Maximum number of iterations to perform.
-                Referes to local minimizer steps.
+                Referes to local minimizer steps or in the context of 
+                `alg: "RL"` to the max. number of episodes.
                 Global steps default to 0 (no global optimization).
                 Can be overridden by specifying in minimizer_kwargs.
 

From 0c6840581c9dde925efcc283015129f946d8dd66 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Thu, 22 Aug 2024 18:00:09 +0200
Subject: [PATCH 17/52] update docstring

---
 src/qutip_qoc/pulse_optim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index efc5d84..662fb67 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -1,7 +1,7 @@
 """
 This module is the entry point for the optimization of control pulses.
 It provides the function `optimize_pulses` which prepares and runs the
-GOAT, JOPT, GRAPE or CRAB optimization.
+GOAT, JOPT, GRAPE, CRAB or RL optimization.
 """
 import numpy as np
 

From e66da0ffd63d65fb9fdd4b0d56071c47440c0d7c Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sat, 24 Aug 2024 17:32:49 +0200
Subject: [PATCH 18/52] Added shorter_pulses to algorithm_kwargs and changed
 callback to stop train early, removed update_solver(), use of var_time = True
 supported, other minor fixes

---
 src/qutip_qoc/_rl.py         | 77 ++++++++++++++++--------------------
 src/qutip_qoc/pulse_optim.py | 12 ++++--
 tests/test_result.py         | 28 ++++++-------
 3 files changed, 55 insertions(+), 62 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 943bcd3..ce4ae74 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -47,8 +47,6 @@ def __init__(
         super(_RL,self).__init__()
         
         self._Hd_lst, self._Hc_lst = [], []
-        if not isinstance(objectives, list):
-            objectives = [objectives]
         for objective in objectives:
             # extract drift and control Hamiltonians from the objective
             self._Hd_lst.append(objective.H[0])
@@ -61,8 +59,7 @@ def __init__(
             self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)])
         self._H = qt.QobjEvo(self._H_lst, self.args)
 
-        self._control_parameters = control_parameters
-        # extract bounds for _control_parameters
+        # extract bounds for control_parameters
         bounds = []
         for key in control_parameters.keys():
             bounds.append(control_parameters[key].get("bounds"))
@@ -70,6 +67,7 @@ def __init__(
         self.ubound = [b[0][1] for b in bounds]
 
         self._alg_kwargs = alg_kwargs
+        self.shorter_pulses = self._alg_kwargs.get("shorter_pulses", False)     # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps
 
         self._initial = objectives[0].initial
         self._target = objectives[0].target
@@ -82,7 +80,7 @@ def __init__(
             start_local_time = time.localtime(),    # initial optimization time
             n_iters = 0,                            # Number of iterations(episodes) until convergence 
             iter_seconds = [],                      # list containing the time taken for each iteration(episode) of the optimization
-            var_time = False,                       # Whether the optimization was performed with variable time
+            var_time = True,                        # Whether the optimization was performed with variable time
         )
 
         #for the reward
@@ -123,12 +121,7 @@ def __init__(
         self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)     # Continuous action space from -1 to +1, as suggested from gym
         self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32)              # Observation space
         
-    def update_solver(self): 
-        """
-        Update the solver and fidelity type based on the problem setup.
-        Chooses the appropriate solver (Schrödinger or master equation) and 
-        prepares for infidelity calculation.
-        """
+        # create the solver
         if self._Hd_lst[0].issuper:
             self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF")
             self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs)
@@ -136,8 +129,6 @@ def update_solver(self):
             self._fid_type = self._alg_kwargs.get("fid_type", "PSU")
             self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
 
-        self.infidelity = self._infid
-
     def pulse(self, t, args, idx):
         """
         Returns the control pulse value at time t for a given index.
@@ -153,7 +144,8 @@ def save_episode_info(self):
             "final_infidelity": self._result.infidelity,
             "terminated": self.terminated,
             "truncated": self.truncated,
-            "steps_used": self.current_step
+            "steps_used": self.current_step,
+            "elapsed_time": time.mktime(time.localtime())
         }
         self.episode_info.append(episode_data)
 
@@ -189,10 +181,8 @@ def step(self, action):
 
         for i, value in enumerate(alphas):
             self.args[f"alpha{i+1}"] = value
-        self._H = qt.QobjEvo(self._H_lst, self.args)
 
-        self.update_solver()                # _H has changed
-        infidelity = self.infidelity()
+        infidelity = self._infid()
 
         self.current_step += 1
         self.temp_actions.append(alphas)
@@ -220,7 +210,7 @@ def reset(self, seed=None):
         """
         self.save_episode_info()
 
-        time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time)
+        time_diff = self.episode_info[-1]["elapsed_time"] - (self.episode_info[-2]["elapsed_time"] if len(self.episode_info) > 1 else time.mktime(self._result.start_local_time))
         self._result.iter_seconds.append(time_diff)
         self.current_step = 0                                           # Reset the step counter
         self.current_episode += 1                                       # Increment episode counter
@@ -238,7 +228,7 @@ def result(self):
         """
         self._result.end_local_time = time.localtime()
         self._result.n_iters = len(self._result.iter_seconds)  
-        self._result.optimized_params = self.actions.copy()
+        self._result.optimized_params = self.actions.copy() + [self._result.total_seconds]                    # If var_time is True, the last parameter is the evolution time
         self._result._optimized_controls = self.actions.copy()
         self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state]
         self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
@@ -274,11 +264,21 @@ def __init__(self, verbose: int = 0):
 
     def _on_step(self) -> bool:
         """
-        This method is required by the BaseCallback class. We use it only to stop the training.
+        This method is required by the BaseCallback class. We use it to stop the training.
+        - Stop training if the maximum number of episodes is reached.
+        - Stop training if it finds an episode with infidelity <= than target infidelity
         """
+        env = self.training_env.envs[0].unwrapped
+
         # Check if we need to stop training
         if self.stop_train:
             return False  # Stop training
+        elif env.current_episode >= env.max_episodes:
+            env._result.message = f"Reached {env.max_episodes} episodes, stopping training."
+            return False    # Stop training
+        elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses):
+            env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found"
+            return False  # Stop training
         return True  # Continue training
 
     def _on_rollout_start(self) -> None:
@@ -286,28 +286,19 @@ def _on_rollout_start(self) -> None:
         This method is called before the rollout starts (before collecting new samples).
         Checks:
         - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training.
-        - Stop training if the maximum number of episodes is reached.
         """
+        #could be moved to on_step
+        
         env = self.training_env.envs[0].unwrapped
-        max_episodes = env.max_episodes
-        fid_err_targ = env._fid_err_targ
-
-        if len(env.episode_info) >= 100:
-            last_100_episodes = env.episode_info[-100:]
-
-            min_steps = min(info['steps_used'] for info in last_100_episodes)
-            steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes)
-            infid_condition = all(ep['final_infidelity'] <= fid_err_targ for ep in last_100_episodes)
-
-            if steps_condition and infid_condition:
-                env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
-                #print(f"Stopping training as no episode in the last 100 used fewer steps and infidelity was below target infid.")
-                self.stop_train = True  # Stop training
-                #print([ep['steps_used'] for ep in last_100_episodes])
-                #print([ep['final_infidelity'] for ep in last_100_episodes])
-
-        # Check max episodes condition
-        if env.current_episode >= max_episodes:
-            env._result.message = f"Reached {max_episodes} episodes, stopping training."
-            #print(f"Reached {max_episodes} episodes, stopping training.")
-            self.stop_train = True  # Stop training
+        #Only if specified in alg_kwargs, the algorithm will search for shorter pulses, resulting in episodes with fewer steps.
+        if env.shorter_pulses:
+            if len(env.episode_info) >= 100:
+                last_100_episodes = env.episode_info[-100:]
+
+                min_steps = min(info['steps_used'] for info in last_100_episodes)
+                steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes)
+                infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes)
+
+                if steps_condition and infid_condition:
+                    env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
+                    self.stop_train = True  # Stop training
\ No newline at end of file
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 662fb67..6964b9a 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -37,7 +37,6 @@ def optimize_pulses(
         Dictionary of options for the control pulse optimization.
         The keys of this dict must be a unique string identifier for each control Hamiltonian / function.
         For the GOAT and JOPT algorithms, the dict may optionally also contain the key "__time__".
-        For RL you don't need to specify the guess.
         For each control function it must specify:
 
             control_id : dict
@@ -46,6 +45,7 @@ def optimize_pulses(
                     where ``n`` is the number of independent variables.
 
                 - bounds : sequence, optional
+                    For RL you don't need to specify the guess.
                     Sequence of ``(min, max)`` pairs for each element in
                     `guess`. None is used to specify no bound.
 
@@ -84,6 +84,11 @@ def optimize_pulses(
                 Global steps default to 0 (no global optimization).
                 Can be overridden by specifying in minimizer_kwargs.
 
+            - shorter_pulses : bool, optional
+                If set to True, allows the algorithm to search for shorter control 
+                pulses that can achieve the desired fidelity target using fewer steps. 
+                By default, it is set to False, only attempting to reach the target infidelity.
+
         Algorithm specific keywords for GRAPE,CRAB can be found in
         :func:`qutip_qtrl.pulseoptim.optimize_pulse`.
 
@@ -154,7 +159,7 @@ def optimize_pulses(
     # extract guess and bounds for the control pulses
     x0, bounds = [], []
     for key in control_parameters.keys():
-        x0.append(control_parameters[key].get("guess"))         # TODO: for now only consider bounds
+        x0.append(control_parameters[key].get("guess"))
         bounds.append(control_parameters[key].get("bounds"))
     try:  # GRAPE, CRAB format
         lbound = [b[0][0] for b in bounds]
@@ -351,8 +356,7 @@ def optimize_pulses(
 
             qtrl_optimizers.append(qtrl_optimizer)
 
-    # TODO: we can deal with proper handling later
-    if alg == "RL":
+    elif alg == "RL":
         rl_env = _RL(
             objectives,
             control_parameters,
diff --git a/tests/test_result.py b/tests/test_result.py
index f2ff583..1ffa3df 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -153,8 +153,6 @@ def sin_z_jax(t, r, **kwargs):
 )
 
 # ----------------------- RL --------------------
-# TODO: this is the input for optimiz_pulses() function
-# you can use this routine to test your implementation
 
 # state to state transfer
 initial = qt.basis(2, 0)
@@ -169,7 +167,6 @@ def sin_z_jax(t, r, **kwargs):
 
 state2state_rl = Case(
     objectives=[Objective(initial, H, target)],
-    #control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds
     control_parameters = {
         "p": {"bounds": [(-13, 13)],}
     },
@@ -177,30 +174,31 @@ def sin_z_jax(t, r, **kwargs):
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
-        "max_iter": 70000,
+        "max_iter": 300,
+        "shorter_pulses": True,
     },
     optimizer_kwargs={}
 )
 
 # no big difference for unitary evolution
 
-#initial = qt.qeye(2) # Identity
-#target  = qt.gates.hadamard_transform()
+initial = qt.qeye(2) # Identity
+target  = qt.gates.hadamard_transform()
 
-#unitary_rl = state2state_rl._replace(
-#    objectives=[Objective(initial, H, target)],
-#)
+unitary_rl = state2state_rl._replace(
+    objectives=[Objective(initial, H, target)],
+)
 
 
 @pytest.fixture(
     params=[
-        #pytest.param(state2state_grape, id="State to state (GRAPE)"),
-        #pytest.param(state2state_crab, id="State to state (CRAB)"),
-        #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
-        #pytest.param(state2state_goat, id="State to state (GOAT)"),
-        #pytest.param(state2state_jax, id="State to state (JAX)"),
+        pytest.param(state2state_grape, id="State to state (GRAPE)"),
+        pytest.param(state2state_crab, id="State to state (CRAB)"),
+        pytest.param(state2state_param_crab, id="State to state (param. CRAB)"),
+        pytest.param(state2state_goat, id="State to state (GOAT)"),
+        pytest.param(state2state_jax, id="State to state (JAX)"),
         pytest.param(state2state_rl, id="State to state (RL)"),
-        #pytest.param(unitary_rl, id="Unitary (RL)"),
+        pytest.param(unitary_rl, id="Unitary (RL)"),
     ]
 )
 def tst(request):

From 6cee77c3823e63ad8039710de6f1f9d1f66d2c03 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sat, 24 Aug 2024 19:07:37 +0200
Subject: [PATCH 19/52] Fixed final state saving

---
 src/qutip_qoc/_rl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index ce4ae74..ede6d5e 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -218,6 +218,7 @@ def reset(self, seed=None):
         self.terminated = False
         self.truncated = False
         self.temp_actions = []
+        self._result._final_states = [self.state]
         self.state = self._initial
         return self._get_obs(), {}
     
@@ -230,7 +231,6 @@ def result(self):
         self._result.n_iters = len(self._result.iter_seconds)  
         self._result.optimized_params = self.actions.copy() + [self._result.total_seconds]                    # If var_time is True, the last parameter is the evolution time
         self._result._optimized_controls = self.actions.copy()
-        self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state]
         self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
         self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
         self._result._guess_controls = []

From 9d6fa29727d0a4f53f4dcb5548c4d6c98ef460c7 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sun, 25 Aug 2024 14:06:13 +0200
Subject: [PATCH 20/52] small corrections

---
 doc/guide/guide-control.rst | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/doc/guide/guide-control.rst b/doc/guide/guide-control.rst
index 4212899..a469716 100644
--- a/doc/guide/guide-control.rst
+++ b/doc/guide/guide-control.rst
@@ -216,21 +216,20 @@ quantum system closer to the desired objective. In this context, the objective c
 preparation of a specific state, state-to-state transfer, or the synthesis of a quantum gate.
 
 Each interaction between the agent and the environment defines a step.
-A sequence of steps forms an episode.The episode ends when certain conditions, such as reaching
+A sequence of steps forms an episode. The episode ends when certain conditions, such as reaching
 a specific fidelity, are met.
-The reward function is a crucial component of the RL algorithm. It must be designed to
-accurately reflect the objective of the quantum control problem.
-The algorithm will aim to update its policy to maximize the reward obtained during the
-various episodes of training. This highlights the importance of ensuring that the control
-problem's objectives are well encoded in the reward function. For example, in a state-to-state
-transfer problem, the reward could be based on the fidelity between the achieved final state
-and the desired target state and subtract a constant penalty term.
+The reward function is a crucial component of the RL algorithm, carefully designed to
+reflect the objective of the quantum control problem.
+It guides the algorithm in updating its policy to maximize the reward obtained during the various
+training episodes.
+For example, in a state-to-state transfer problem, the reward could be based on the fidelity
+between the achieved final state and the desired target state and subtract a constant penalty term.
 The step penalty is a small value that encourages the agent to reach the objective in as few steps as possible.
 
 In QuTiP, the RL environment is modeled as a custom class derived from the gymnasium library.
 This class allows defining the quantum system's dynamics at each step, the actions the agent
-can take, the observation space, and so on. The RL agent can be trained using pre-existing
-policies such as Proximal Policy Optimization (PPO) from the stable_baselines3 library.
+can take, the observation space, and so on. The RL agent is trained using the Proximal Policy Optimization
+(PPO) algorithm from the stable baselines3 library.
 
 
 Optimal Quantum Control in QuTiP

From 318081f1d34ac17e484622cce117290756a57f9b Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Mon, 26 Aug 2024 10:28:31 +0200
Subject: [PATCH 21/52] edit configuration files

---
 .pre-commit-config.yaml | 2 ++
 requirements.txt        | 2 ++
 setup.cfg               | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0bcf23a..0c027ee 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -92,3 +92,5 @@ repos:
           - jaxlib
           - diffrax
           - pytest
+          - gymnasium
+          - stable-baselines3
diff --git a/requirements.txt b/requirements.txt
index a08565e..9687463 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@ qutip>=5.0.1
 qutip-qtrl
 qutip-jax
 pre-commit
+gymnasium>=0.29.1
+stable-baselines3>=2.3.2
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 3903195..7872023 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,6 +35,8 @@ install_requires =
     qutip-qtrl
     qutip-jax
     numpy>=1.16.6,<2.0
+    gymnasium>=0.29.1
+    stable-baselines3>=2.3.2
 setup_requires =
     cython>=1.0
     packaging

From 54038092d56fab144d6282466856858387a6b668 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Mon, 9 Sep 2024 13:29:42 +0200
Subject: [PATCH 22/52] Minor text fixes

---
 doc/guide/guide-control.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/guide/guide-control.rst b/doc/guide/guide-control.rst
index a469716..915b6b6 100644
--- a/doc/guide/guide-control.rst
+++ b/doc/guide/guide-control.rst
@@ -222,9 +222,10 @@ The reward function is a crucial component of the RL algorithm, carefully design
 reflect the objective of the quantum control problem.
 It guides the algorithm in updating its policy to maximize the reward obtained during the various
 training episodes.
-For example, in a state-to-state transfer problem, the reward could be based on the fidelity
-between the achieved final state and the desired target state and subtract a constant penalty term.
-The step penalty is a small value that encourages the agent to reach the objective in as few steps as possible.
+For example, in a state-to-state transfer problem, the reward is based on the fidelity
+between the achieved final state and the desired target state.
+In addition, a constant penalty term is subtracted in order to encourages the agent to reach the
+objective in as few steps as possible.
 
 In QuTiP, the RL environment is modeled as a custom class derived from the gymnasium library.
 This class allows defining the quantum system's dynamics at each step, the actions the agent

From 4485f9d42685364962ac4b436d4810200e2e1a26 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Sat, 14 Sep 2024 20:17:11 +0200
Subject: [PATCH 23/52] Docstring fixes, use of __time__ parameter instead of
 shorter_pulses, added underscore for internal variables, args is now passed
 as a parameter to _infid(), changes in callback function

---
 src/qutip_qoc/_rl.py         | 126 +++++++++++++++++------------------
 src/qutip_qoc/pulse_optim.py |  10 +--
 tests/test_result.py         |   9 ++-
 3 files changed, 69 insertions(+), 76 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index ede6d5e..66b8474 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -52,27 +52,34 @@ def __init__(
             self._Hd_lst.append(objective.H[0])
             self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]])
 
-        # create the QobjEvo with Hd, Hc and controls(args)
-        self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))}    # set the control parameters to 1 for all the Hc
+        def create_pulse_func(idx):
+            """
+            Create a control pulse lambda function for a given index.
+            """
+            return lambda t, args: self._pulse(t, args, idx+1)
+        
+        # create the QobjEvo with Hd, Hc and controls(args)        
         self._H_lst = [self._Hd_lst[0]]
+        dummy_args = {f'alpha{i+1}': 1.0 for i in range(len(self._Hc_lst[0]))}
         for i, Hc in enumerate(self._Hc_lst[0]):
-            self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)])
-        self._H = qt.QobjEvo(self._H_lst, self.args)
+            self._H_lst.append([Hc, create_pulse_func(i)])
+        self._H = qt.QobjEvo(self._H_lst, args=dummy_args)
+
+        self.shorter_pulses = False if time_options == {} else True                 # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps           
 
         # extract bounds for control_parameters
         bounds = []
         for key in control_parameters.keys():
             bounds.append(control_parameters[key].get("bounds"))
-        self.lbound = [b[0][0] for b in bounds]
-        self.ubound = [b[0][1] for b in bounds]
+        self._lbound = [b[0][0] for b in bounds]
+        self._ubound = [b[0][1] for b in bounds]
 
         self._alg_kwargs = alg_kwargs
-        self.shorter_pulses = self._alg_kwargs.get("shorter_pulses", False)     # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps
 
         self._initial = objectives[0].initial
         self._target = objectives[0].target
-        self.state = None
-        self.dim = self._initial.shape[0]
+        self._state = None
+        self._dim = self._initial.shape[0]
 
         self._result = Result(
             objectives = objectives,
@@ -87,19 +94,19 @@ def __init__(
         self._step_penalty = 1
 
         # To check if it exceeds the maximum number of steps in an episode
-        self.current_step = 0
+        self._current_step = 0
 
         self.terminated = False
         self.truncated = False
-        self.episode_info = []                # to contain some information from the latest episode
+        self._episode_info = []                # to contain some information from the latest episode
 
         self._fid_err_targ = alg_kwargs["fid_err_targ"]
 
         # inferred attributes
         self._norm_fac = 1 / self._target.norm()
 
-        self.temp_actions = []                  # temporary list to save episode actions
-        self.actions = []                       # list of actions(lists) of the last episode
+        self._temp_actions = []                  # temporary list to save episode actions
+        self._actions = []                       # list of actions(lists) of the last episode
 
         # integrator options
         self._integrator_kwargs = integrator_kwargs
@@ -108,16 +115,16 @@ def __init__(
 
         self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
         self.max_steps = time_interval.n_tslots                         # maximum number of steps in an episode
-        self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
+        self._step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
         self.max_episodes = alg_kwargs["max_iter"]                      # maximum number of episodes for training
-        self.total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
+        self._total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
         self.current_episode = 0                                        # To keep track of the current episode
         
         # Define action and observation spaces (Gym)
         if self._initial.isket:
-            obs_shape = (2 * self.dim,)
+            obs_shape = (2 * self._dim,)
         else:   # for unitary operators 
-            obs_shape = (2 * self.dim * self.dim,)
+            obs_shape = (2 * self._dim * self._dim,)
         self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)     # Continuous action space from -1 to +1, as suggested from gym
         self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32)              # Observation space
         
@@ -129,13 +136,14 @@ def __init__(
             self._fid_type = self._alg_kwargs.get("fid_type", "PSU")
             self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs)
 
-    def pulse(self, t, args, idx):
+    def _pulse(self, t, args, idx):
         """
         Returns the control pulse value at time t for a given index.
         """
-        return 1*args[f"alpha{idx}"]
+        alpha = args[f"alpha{idx}"]
+        return alpha
     
-    def save_episode_info(self):
+    def _save_episode_info(self):
         """
         Save the information of the last episode before resetting the environment.
         """
@@ -144,19 +152,19 @@ def save_episode_info(self):
             "final_infidelity": self._result.infidelity,
             "terminated": self.terminated,
             "truncated": self.truncated,
-            "steps_used": self.current_step,
+            "steps_used": self._current_step,
             "elapsed_time": time.mktime(time.localtime())
         }
-        self.episode_info.append(episode_data)
+        self._episode_info.append(episode_data)
 
-    def _infid(self, params=None):
+    def _infid(self, args):
         """
         The agent performs a step, then calculate infidelity to be minimized of the current state against the target state.
         """
         X = self._solver.run(
-            self.state, [0.0, self.step_duration], args={"p": params}
+            self._state, [0.0, self._step_duration], args=args
         ).final_state
-        self.state = X
+        self._state = X
 
         if self._fid_type == "TRACEDIFF":
             diff = X - self._target
@@ -177,20 +185,18 @@ def step(self, action):
         Perform a single time step in the environment, applying the scaled action (control pulse) 
         chosen by the RL agent. Updates the system's state and computes the reward.
         """
-        alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] 
-
-        for i, value in enumerate(alphas):
-            self.args[f"alpha{i+1}"] = value
+        alphas = [((action[i] + 1) / 2 * (self._ubound[0] - self._lbound[0])) + self._lbound[0] for i in range(len(action))] 
 
-        infidelity = self._infid()
+        args = {f"alpha{i+1}": value for i, value in enumerate(alphas)}
+        _infidelity = self._infid(args)
 
-        self.current_step += 1
-        self.temp_actions.append(alphas)
-        self._result.infidelity = infidelity
-        reward = (1 - infidelity) - self._step_penalty
+        self._current_step += 1
+        self._temp_actions.append(alphas)
+        self._result.infidelity = _infidelity
+        reward = (1 - _infidelity) - self._step_penalty
 
-        self.terminated = infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
-        self.truncated = self.current_step >= self.max_steps                     # if the episode ended without reaching the goal
+        self.terminated = _infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
+        self.truncated = self._current_step >= self.max_steps                     # if the episode ended without reaching the goal
 
         observation = self._get_obs()
         return observation, reward, bool(self.terminated), bool(self.truncated), {}
@@ -200,7 +206,7 @@ def _get_obs(self):
         Get the current state observation for the RL agent. Converts the system's 
         quantum state or matrix into a real-valued NumPy array suitable for RL algorithms.
         """
-        rho = self.state.full().flatten()
+        rho = self._state.full().flatten()
         obs = np.concatenate((np.real(rho), np.imag(rho)))
         return obs.astype(np.float32)                                       # Gymnasium expects the observation to be of type float32
     
@@ -208,18 +214,18 @@ def reset(self, seed=None):
         """
         Reset the environment to the initial state, preparing for a new episode.
         """
-        self.save_episode_info()
+        self._save_episode_info()
 
-        time_diff = self.episode_info[-1]["elapsed_time"] - (self.episode_info[-2]["elapsed_time"] if len(self.episode_info) > 1 else time.mktime(self._result.start_local_time))
+        time_diff = self._episode_info[-1]["elapsed_time"] - (self._episode_info[-2]["elapsed_time"] if len(self._episode_info) > 1 else time.mktime(self._result.start_local_time))
         self._result.iter_seconds.append(time_diff)
-        self.current_step = 0                                           # Reset the step counter
+        self._current_step = 0                                           # Reset the step counter
         self.current_episode += 1                                       # Increment episode counter
-        self.actions = self.temp_actions.copy()
+        self._actions = self._temp_actions.copy()
         self.terminated = False
         self.truncated = False
-        self.temp_actions = []
-        self._result._final_states = [self.state]
-        self.state = self._initial
+        self._temp_actions = []
+        self._result._final_states = [self._state]
+        self._state = self._initial
         return self._get_obs(), {}
     
     def result(self):
@@ -229,8 +235,8 @@ def result(self):
         """
         self._result.end_local_time = time.localtime()
         self._result.n_iters = len(self._result.iter_seconds)  
-        self._result.optimized_params = self.actions.copy() + [self._result.total_seconds]                    # If var_time is True, the last parameter is the evolution time
-        self._result._optimized_controls = self.actions.copy()
+        self._result.optimized_params = self._actions.copy() + [self._result.total_seconds]                    # If var_time is True, the last parameter is the evolution time
+        self._result._optimized_controls = self._actions.copy()
         self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
         self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
         self._result._guess_controls = []
@@ -252,7 +258,7 @@ def train(self):
         stop_callback = EarlyStopTraining(verbose=1)
         
         # Train the model
-        model.learn(total_timesteps = self.total_timesteps, callback=stop_callback)
+        model.learn(total_timesteps = self._total_timesteps, callback=stop_callback)
 
 class EarlyStopTraining(BaseCallback):
     """
@@ -267,33 +273,20 @@ def _on_step(self) -> bool:
         This method is required by the BaseCallback class. We use it to stop the training.
         - Stop training if the maximum number of episodes is reached.
         - Stop training if it finds an episode with infidelity <= than target infidelity
+        - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training.
         """
         env = self.training_env.envs[0].unwrapped
 
         # Check if we need to stop training
-        if self.stop_train:
-            return False  # Stop training
-        elif env.current_episode >= env.max_episodes:
+        if env.current_episode >= env.max_episodes:
             env._result.message = f"Reached {env.max_episodes} episodes, stopping training."
             return False    # Stop training
         elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses):
             env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found"
             return False  # Stop training
-        return True  # Continue training
-
-    def _on_rollout_start(self) -> None:
-        """
-        This method is called before the rollout starts (before collecting new samples).
-        Checks:
-        - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training.
-        """
-        #could be moved to on_step
-        
-        env = self.training_env.envs[0].unwrapped
-        #Only if specified in alg_kwargs, the algorithm will search for shorter pulses, resulting in episodes with fewer steps.
-        if env.shorter_pulses:
-            if len(env.episode_info) >= 100:
-                last_100_episodes = env.episode_info[-100:]
+        elif env.shorter_pulses:
+            if len(env._episode_info) >= 100:
+                last_100_episodes = env._episode_info[-100:]
 
                 min_steps = min(info['steps_used'] for info in last_100_episodes)
                 steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes)
@@ -301,4 +294,5 @@ def _on_rollout_start(self) -> None:
 
                 if steps_condition and infid_condition:
                     env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
-                    self.stop_train = True  # Stop training
\ No newline at end of file
+                    return False  # Stop training
+        return True  # Continue training
\ No newline at end of file
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 6964b9a..8a1f4dc 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -41,16 +41,17 @@ def optimize_pulses(
 
             control_id : dict
                 - guess: ndarray, shape (n,)
+                    For RL you don't need to specify the guess.
                     Initial guess. Array of real elements of size (n,),
                     where ``n`` is the number of independent variables.
 
                 - bounds : sequence, optional
-                    For RL you don't need to specify the guess.
                     Sequence of ``(min, max)`` pairs for each element in
                     `guess`. None is used to specify no bound.
 
             __time__ : dict, optional
-                Only supported by GOAT and JOPT.
+                Only supported by GOAT, JOPT and RL.
+                For RL the values of guess and bounds are not relevant.
                 If given the pulse duration is treated as optimization parameter.
                 It must specify both:
 
@@ -84,11 +85,6 @@ def optimize_pulses(
                 Global steps default to 0 (no global optimization).
                 Can be overridden by specifying in minimizer_kwargs.
 
-            - shorter_pulses : bool, optional
-                If set to True, allows the algorithm to search for shorter control 
-                pulses that can achieve the desired fidelity target using fewer steps. 
-                By default, it is set to False, only attempting to reach the target infidelity.
-
         Algorithm specific keywords for GRAPE,CRAB can be found in
         :func:`qutip_qtrl.pulseoptim.optimize_pulse`.
 
diff --git a/tests/test_result.py b/tests/test_result.py
index 1ffa3df..b74a211 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -168,14 +168,17 @@ def sin_z_jax(t, r, **kwargs):
 state2state_rl = Case(
     objectives=[Objective(initial, H, target)],
     control_parameters = {
-        "p": {"bounds": [(-13, 13)],}
+        "p": {"bounds": [(-13, 13)]},
+        "__time__": {
+            "guess": np.array([0.0]),  #dummy value
+            "bounds": [(0.0, 0.0)]    #dummy value
+        }
     },
     tlist=np.linspace(0, 10, 100),
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
-        "max_iter": 300,
-        "shorter_pulses": True,
+        "max_iter": 20000,
     },
     optimizer_kwargs={}
 )

From 3889a63d688e11a67ce69b382a1049725e79972f Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Mon, 30 Sep 2024 17:17:00 +0200
Subject: [PATCH 24/52] added _backup_result to keep track of the result even
 if the algorithm continues to search for solutions with shorter pulses

---
 src/qutip_qoc/_rl.py | 64 ++++++++++++++++++++++++++++++++++----------
 tests/test_result.py | 12 +++++++++
 2 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 66b8474..3bdb1fc 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -88,7 +88,19 @@ def create_pulse_func(idx):
             n_iters = 0,                            # Number of iterations(episodes) until convergence 
             iter_seconds = [],                      # list containing the time taken for each iteration(episode) of the optimization
             var_time = True,                        # Whether the optimization was performed with variable time
+            guess_params=[]
         )
+      
+        self._backup_result = Result(                              # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid<target_infid
+            objectives = objectives,
+            time_interval = time_interval,
+            start_local_time = time.localtime(),
+            n_iters = 0,
+            iter_seconds = [],
+            var_time = True,
+            guess_params=[]
+        )
+        self._use_backup_result = False                             # if true, use self._backup_result as the final optimization result
 
         #for the reward
         self._step_penalty = 1
@@ -228,21 +240,39 @@ def reset(self, seed=None):
         self._state = self._initial
         return self._get_obs(), {}
     
-    def result(self):
+    def _save_result(self):
         """
-        Retrieve the results of the optimization process, including the optimized 
+        Save the results of the optimization process, including the optimized 
         pulse sequences, final states, and performance metrics.
         """
-        self._result.end_local_time = time.localtime()
-        self._result.n_iters = len(self._result.iter_seconds)  
-        self._result.optimized_params = self._actions.copy() + [self._result.total_seconds]                    # If var_time is True, the last parameter is the evolution time
-        self._result._optimized_controls = self._actions.copy()
-        self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
-        self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
-        self._result._guess_controls = []
-        self._result._optimized_H = [self._H]
-        self._result.guess_params = []
-        return self._result
+        result_obj = self._backup_result if self._use_backup_result else self._result
+
+        if(self._use_backup_result):
+            self._backup_result.iter_seconds = self._result.iter_seconds.copy()
+            self._backup_result._final_states = self._result._final_states.copy()
+            self._backup_result.infidelity = self._result.infidelity
+
+        result_obj.end_local_time = time.localtime()
+        result_obj.n_iters = len(self._result.iter_seconds)  
+        result_obj.optimized_params = self._actions.copy() + [self._result.total_seconds]                      # If var_time is True, the last parameter is the evolution time
+        result_obj._optimized_controls = self._actions.copy()
+        result_obj._guess_controls = []
+        result_obj._optimized_H = [self._H]
+
+
+    def result(self):
+        """
+        Final conversions and return of optimization results
+        """            
+        if self._use_backup_result:
+            self._backup_result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time)       # Convert to a string
+            self._backup_result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time)           # Convert to a string
+            return self._backup_result
+        else:
+            self._save_result()
+            self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
+            self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
+            return self._result
 
     def train(self):
         """
@@ -266,7 +296,6 @@ class EarlyStopTraining(BaseCallback):
     """
     def __init__(self, verbose: int = 0):
         super(EarlyStopTraining, self).__init__(verbose)
-        self.stop_train = False
 
     def _on_step(self) -> bool:
         """
@@ -279,12 +308,18 @@ def _on_step(self) -> bool:
 
         # Check if we need to stop training
         if env.current_episode >= env.max_episodes:
-            env._result.message = f"Reached {env.max_episodes} episodes, stopping training."
+            if env._use_backup_result == True:
+                env._backup_result.message = f"Reached {env.max_episodes} episodes, stopping training. Return the last founded episode with infid < target_infid"
+            else:
+                env._result.message = f"Reached {env.max_episodes} episodes, stopping training."
             return False    # Stop training
         elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses):
             env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found"
             return False  # Stop training
         elif env.shorter_pulses:
+            if(env._result.infidelity <= env._fid_err_targ):        # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime
+                env._use_backup_result = True
+                env._save_result()
             if len(env._episode_info) >= 100:
                 last_100_episodes = env._episode_info[-100:]
 
@@ -293,6 +328,7 @@ def _on_step(self) -> bool:
                 infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes)
 
                 if steps_condition and infid_condition:
+                    env._use_backup_result = False
                     env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
                     return False  # Stop training
         return True  # Continue training
\ No newline at end of file
diff --git a/tests/test_result.py b/tests/test_result.py
index b74a211..9bb5496 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -190,6 +190,18 @@ def sin_z_jax(t, r, **kwargs):
 
 unitary_rl = state2state_rl._replace(
     objectives=[Objective(initial, H, target)],
+        control_parameters = {
+        "p": {"bounds": [(-13, 13)]},
+        "__time__": {
+            "guess": np.array([0.0]),  #dummy value
+            "bounds": [(0.0, 0.0)]    #dummy value
+        }
+    },
+    algorithm_kwargs={
+        "fid_err_targ": 0.01,
+        "alg": "RL",
+        "max_iter": 300,
+    },
 )
 
 
From cd618771614787538b50c9bb9c552086866594ce Mon Sep 17 00:00:00 2001
From: flowerthrower <flowerthrower@users.noreply.github.com>
Date: Tue, 1 Oct 2024 17:19:22 +0200
Subject: [PATCH 25/52] get kwarg instead of pop

---
 src/qutip_qoc/_goat.py       | 4 +++-
 src/qutip_qoc/_optimizer.py  | 3 ---
 src/qutip_qoc/pulse_optim.py | 7 ++++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/qutip_qoc/_goat.py b/src/qutip_qoc/_goat.py
index 007388c..617e660 100644
--- a/src/qutip_qoc/_goat.py
+++ b/src/qutip_qoc/_goat.py
@@ -65,7 +65,9 @@ def __init__(
         self._var_t = "guess" in time_options
 
         # num of params for each control function
-        self._para_counts = [len(v["guess"]) for v in control_parameters.values()]
+        self._para_counts = [
+            len(v["guess"]) for k, v in control_parameters.items() if k != "__time__"
+        ]
 
         # inferred attributes
         self._tot_n_para = sum(self._para_counts)  # excl. time
diff --git a/src/qutip_qoc/_optimizer.py b/src/qutip_qoc/_optimizer.py
index e5b12f5..8218d51 100644
--- a/src/qutip_qoc/_optimizer.py
+++ b/src/qutip_qoc/_optimizer.py
@@ -283,9 +283,6 @@ def _global_local_optimization(
         _get_init_and_bounds_from_options(x0, control_parameters[key].get("guess"))
         _get_init_and_bounds_from_options(bounds, control_parameters[key].get("bounds"))
 
-    _get_init_and_bounds_from_options(x0, time_options.get("guess", None))
-    _get_init_and_bounds_from_options(bounds, time_options.get("bounds", None))
-
     optimizer_kwargs["x0"] = np.concatenate(x0)
 
     multi_objective = _MultiObjective(
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 45866be..6b2c7f8 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -133,7 +133,7 @@ def optimize_pulses(
     # create time interval
     time_interval = _TimeInterval(tslots=tlist)
 
-    time_options = control_parameters.pop("__time__", {})
+    time_options = control_parameters.get("__time__", {})
     if time_options:  # convert to list of bounds if not already
         if not isinstance(time_options["bounds"][0], (list, tuple)):
             time_options["bounds"] = [time_options["bounds"]]
@@ -151,8 +151,9 @@ def optimize_pulses(
     # extract guess and bounds for the control pulses
     x0, bounds = [], []
     for key in control_parameters.keys():
-        x0.append(control_parameters[key].get("guess"))
-        bounds.append(control_parameters[key].get("bounds"))
+        if key != "__time__":
+            x0.append(control_parameters[key].get("guess"))
+            bounds.append(control_parameters[key].get("bounds"))
     try:  # GRAPE, CRAB format
         lbound = [b[0][0] for b in bounds]
         ubound = [b[0][1] for b in bounds]

From ceb1de2c47b6ccefc1ae1b7f40937d2f65a0227a Mon Sep 17 00:00:00 2001
From: flowerthrower <flowerthrower@users.noreply.github.com>
Date: Wed, 2 Oct 2024 10:45:59 +0200
Subject: [PATCH 26/52] automatically convert to superoperator

---
 src/qutip_qoc/objective.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/qutip_qoc/objective.py b/src/qutip_qoc/objective.py
index 986961a..d91caac 100644
--- a/src/qutip_qoc/objective.py
+++ b/src/qutip_qoc/objective.py
@@ -80,6 +80,12 @@ def __init__(self, initial, H, target, weight=1):
         self.target = target
         self.weight = weight
 
+        # Check if any Hamiltonian in H is a superoperator
+        if any(qt.issuper(H_i) for H_i in (H if isinstance(H, list) else [H])):
+            # Convert initial and target accordingly
+            self.initial = qt.to_super(self.initial)
+            self.target = qt.to_super(self.target)
+
     def __getstate__(self):
         """
         Extract picklable information from the objective.

From 8fe527ba5affa1a544546fb526cb91560b7ed418 Mon Sep 17 00:00:00 2001
From: flowerthrower <flowerthrower@users.noreply.github.com>
Date: Wed, 2 Oct 2024 11:25:51 +0200
Subject: [PATCH 27/52] back to 'shorter_pulses'

---
 src/qutip_qoc/_rl.py         |  2 +-
 src/qutip_qoc/pulse_optim.py |  3 +--
 tests/test_result.py         | 10 ++--------
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 3bdb1fc..cdcaa01 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -65,7 +65,7 @@ def create_pulse_func(idx):
             self._H_lst.append([Hc, create_pulse_func(i)])
         self._H = qt.QobjEvo(self._H_lst, args=dummy_args)
 
-        self.shorter_pulses = False if time_options == {} else True                 # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps           
+        self.shorter_pulses = alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps           
 
         # extract bounds for control_parameters
         bounds = []
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 8a1f4dc..d320799 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -50,8 +50,7 @@ def optimize_pulses(
                     `guess`. None is used to specify no bound.
 
             __time__ : dict, optional
-                Only supported by GOAT, JOPT and RL.
-                For RL the values of guess and bounds are not relevant.
+                Only supported by GOAT, JOPT (for RL use `algorithm_kwargs: 'shorter_pulses'`).
                 If given the pulse duration is treated as optimization parameter.
                 It must specify both:
 
diff --git a/tests/test_result.py b/tests/test_result.py
index 9bb5496..7b9cda3 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -169,16 +169,13 @@ def sin_z_jax(t, r, **kwargs):
     objectives=[Objective(initial, H, target)],
     control_parameters = {
         "p": {"bounds": [(-13, 13)]},
-        "__time__": {
-            "guess": np.array([0.0]),  #dummy value
-            "bounds": [(0.0, 0.0)]    #dummy value
-        }
     },
     tlist=np.linspace(0, 10, 100),
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
         "max_iter": 20000,
+        "shorter_pulses": True,
     },
     optimizer_kwargs={}
 )
@@ -192,15 +189,12 @@ def sin_z_jax(t, r, **kwargs):
     objectives=[Objective(initial, H, target)],
         control_parameters = {
         "p": {"bounds": [(-13, 13)]},
-        "__time__": {
-            "guess": np.array([0.0]),  #dummy value
-            "bounds": [(0.0, 0.0)]    #dummy value
-        }
     },
     algorithm_kwargs={
         "fid_err_targ": 0.01,
         "alg": "RL",
         "max_iter": 300,
+        "shorter_pulses": True,
     },
 )
 

From 604b12e2f827d82f8823cc4c141cd0e247ea27d1 Mon Sep 17 00:00:00 2001
From: flowerthrower <flowerthrower@users.noreply.github.com>
Date: Wed, 2 Oct 2024 11:37:49 +0200
Subject: [PATCH 28/52] pre-commit formating

---
 requirements.txt             |   2 +-
 src/qutip_qoc/_rl.py         | 227 +++++++++++++++++++++--------------
 src/qutip_qoc/pulse_optim.py |   2 +-
 tests/test_result.py         |  18 +--
 4 files changed, 150 insertions(+), 99 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9687463..50da21d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,4 @@ qutip-qtrl
 qutip-jax
 pre-commit
 gymnasium>=0.29.1
-stable-baselines3>=2.3.2
\ No newline at end of file
+stable-baselines3>=2.3.2
diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index cdcaa01..9a75a55 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -1,10 +1,10 @@
 """
-This module contains functions that implement quantum optimal control 
-using reinforcement learning (RL) techniques, allowing for the optimization 
+This module contains functions that implement quantum optimal control
+using reinforcement learning (RL) techniques, allowing for the optimization
 of control pulse sequences in quantum systems.
 """
 import qutip as qt
-from qutip import Qobj, QobjEvo
+from qutip import Qobj
 from qutip_qoc import Result
 
 import numpy as np
@@ -17,12 +17,13 @@
 
 import time
 
+
 class _RL(gym.Env):
     """
-    Class for storing a control problem and implementing quantum optimal 
-    control using reinforcement learning. This class defines a custom 
-    Gym environment that models the dynamics of quantum systems 
-    under various control pulses, and uses RL algorithms to optimize the 
+    Class for storing a control problem and implementing quantum optimal
+    control using reinforcement learning. This class defines a custom
+    Gym environment that models the dynamics of quantum systems
+    under various control pulses, and uses RL algorithms to optimize the
     parameters of these pulses.
     """
 
@@ -39,33 +40,37 @@ def __init__(
         qtrl_optimizers,
     ):
         """
-        Initialize the reinforcement learning environment for quantum 
-        optimal control. Sets up the system Hamiltonian, control parameters, 
+        Initialize the reinforcement learning environment for quantum
+        optimal control. Sets up the system Hamiltonian, control parameters,
         and defines the observation and action spaces for the RL agent.
         """
 
-        super(_RL,self).__init__()
-        
+        super(_RL, self).__init__()
+
         self._Hd_lst, self._Hc_lst = [], []
         for objective in objectives:
             # extract drift and control Hamiltonians from the objective
             self._Hd_lst.append(objective.H[0])
-            self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]])
+            self._Hc_lst.append(
+                [H[0] if isinstance(H, list) else H for H in objective.H[1:]]
+            )
 
         def create_pulse_func(idx):
             """
             Create a control pulse lambda function for a given index.
             """
-            return lambda t, args: self._pulse(t, args, idx+1)
-        
-        # create the QobjEvo with Hd, Hc and controls(args)        
+            return lambda t, args: self._pulse(t, args, idx + 1)
+
+        # create the QobjEvo with Hd, Hc and controls(args)
         self._H_lst = [self._Hd_lst[0]]
-        dummy_args = {f'alpha{i+1}': 1.0 for i in range(len(self._Hc_lst[0]))}
+        dummy_args = {f"alpha{i+1}": 1.0 for i in range(len(self._Hc_lst[0]))}
         for i, Hc in enumerate(self._Hc_lst[0]):
             self._H_lst.append([Hc, create_pulse_func(i)])
         self._H = qt.QobjEvo(self._H_lst, args=dummy_args)
 
-        self.shorter_pulses = alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps           
+        self.shorter_pulses = alg_kwargs.get(
+            "shorter_pulses", False
+        )  # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps
 
         # extract bounds for control_parameters
         bounds = []
@@ -82,27 +87,29 @@ def create_pulse_func(idx):
         self._dim = self._initial.shape[0]
 
         self._result = Result(
-            objectives = objectives,
-            time_interval = time_interval,
-            start_local_time = time.localtime(),    # initial optimization time
-            n_iters = 0,                            # Number of iterations(episodes) until convergence 
-            iter_seconds = [],                      # list containing the time taken for each iteration(episode) of the optimization
-            var_time = True,                        # Whether the optimization was performed with variable time
-            guess_params=[]
+            objectives=objectives,
+            time_interval=time_interval,
+            start_local_time=time.localtime(),  # initial optimization time
+            n_iters=0,  # Number of iterations(episodes) until convergence
+            iter_seconds=[],  # list containing the time taken for each iteration(episode) of the optimization
+            var_time=True,  # Whether the optimization was performed with variable time
+            guess_params=[],
+        )
+
+        self._backup_result = Result(  # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid<target_infid
+            objectives=objectives,
+            time_interval=time_interval,
+            start_local_time=time.localtime(),
+            n_iters=0,
+            iter_seconds=[],
+            var_time=True,
+            guess_params=[],
         )
-      
-        self._backup_result = Result(                              # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid<target_infid
-            objectives = objectives,
-            time_interval = time_interval,
-            start_local_time = time.localtime(),
-            n_iters = 0,
-            iter_seconds = [],
-            var_time = True,
-            guess_params=[]
+        self._use_backup_result = (
+            False  # if true, use self._backup_result as the final optimization result
         )
-        self._use_backup_result = False                             # if true, use self._backup_result as the final optimization result
 
-        #for the reward
+        # for the reward
         self._step_penalty = 1
 
         # To check if it exceeds the maximum number of steps in an episode
@@ -110,36 +117,44 @@ def create_pulse_func(idx):
 
         self.terminated = False
         self.truncated = False
-        self._episode_info = []                # to contain some information from the latest episode
+        self._episode_info = []  # to contain some information from the latest episode
 
         self._fid_err_targ = alg_kwargs["fid_err_targ"]
 
         # inferred attributes
         self._norm_fac = 1 / self._target.norm()
 
-        self._temp_actions = []                  # temporary list to save episode actions
-        self._actions = []                       # list of actions(lists) of the last episode
+        self._temp_actions = []  # temporary list to save episode actions
+        self._actions = []  # list of actions(lists) of the last episode
 
         # integrator options
         self._integrator_kwargs = integrator_kwargs
         self._rtol = self._integrator_kwargs.get("rtol", 1e-5)
         self._atol = self._integrator_kwargs.get("atol", 1e-5)
 
-        self.max_episode_time = time_interval.evo_time                  # maximum time for an episode
-        self.max_steps = time_interval.n_tslots                         # maximum number of steps in an episode
-        self._step_duration = time_interval.tslots[-1] / time_interval.n_tslots  # step duration for mesvole
-        self.max_episodes = alg_kwargs["max_iter"]                      # maximum number of episodes for training
-        self._total_timesteps = self.max_episodes * self.max_steps       # for learn() of gym
-        self.current_episode = 0                                        # To keep track of the current episode
-        
+        self.max_episode_time = time_interval.evo_time  # maximum time for an episode
+        self.max_steps = time_interval.n_tslots  # maximum number of steps in an episode
+        self._step_duration = (
+            time_interval.tslots[-1] / time_interval.n_tslots
+        )  # step duration for mesvole
+        self.max_episodes = alg_kwargs[
+            "max_iter"
+        ]  # maximum number of episodes for training
+        self._total_timesteps = self.max_episodes * self.max_steps  # for learn() of gym
+        self.current_episode = 0  # To keep track of the current episode
+
         # Define action and observation spaces (Gym)
         if self._initial.isket:
             obs_shape = (2 * self._dim,)
-        else:   # for unitary operators 
+        else:  # for unitary operators
             obs_shape = (2 * self._dim * self._dim,)
-        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32)     # Continuous action space from -1 to +1, as suggested from gym
-        self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32)              # Observation space
-        
+        self.action_space = spaces.Box(
+            low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32
+        )  # Continuous action space from -1 to +1, as suggested from gym
+        self.observation_space = spaces.Box(
+            low=-1, high=1, shape=obs_shape, dtype=np.float32
+        )  # Observation space
+
         # create the solver
         if self._Hd_lst[0].issuper:
             self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF")
@@ -154,7 +169,7 @@ def _pulse(self, t, args, idx):
         """
         alpha = args[f"alpha{idx}"]
         return alpha
-    
+
     def _save_episode_info(self):
         """
         Save the information of the last episode before resetting the environment.
@@ -165,7 +180,7 @@ def _save_episode_info(self):
             "terminated": self.terminated,
             "truncated": self.truncated,
             "steps_used": self._current_step,
-            "elapsed_time": time.mktime(time.localtime())
+            "elapsed_time": time.mktime(time.localtime()),
         }
         self._episode_info.append(episode_data)
 
@@ -194,10 +209,14 @@ def _infid(self, args):
 
     def step(self, action):
         """
-        Perform a single time step in the environment, applying the scaled action (control pulse) 
+        Perform a single time step in the environment, applying the scaled action (control pulse)
         chosen by the RL agent. Updates the system's state and computes the reward.
         """
-        alphas = [((action[i] + 1) / 2 * (self._ubound[0] - self._lbound[0])) + self._lbound[0] for i in range(len(action))] 
+        alphas = [
+            ((action[i] + 1) / 2 * (self._ubound[0] - self._lbound[0]))
+            + self._lbound[0]
+            for i in range(len(action))
+        ]
 
         args = {f"alpha{i+1}": value for i, value in enumerate(alphas)}
         _infidelity = self._infid(args)
@@ -207,31 +226,41 @@ def step(self, action):
         self._result.infidelity = _infidelity
         reward = (1 - _infidelity) - self._step_penalty
 
-        self.terminated = _infidelity <= self._fid_err_targ                       # the episode ended reaching the goal
-        self.truncated = self._current_step >= self.max_steps                     # if the episode ended without reaching the goal
+        self.terminated = (
+            _infidelity <= self._fid_err_targ
+        )  # the episode ended reaching the goal
+        self.truncated = (
+            self._current_step >= self.max_steps
+        )  # if the episode ended without reaching the goal
 
         observation = self._get_obs()
         return observation, reward, bool(self.terminated), bool(self.truncated), {}
 
     def _get_obs(self):
         """
-        Get the current state observation for the RL agent. Converts the system's 
+        Get the current state observation for the RL agent. Converts the system's
         quantum state or matrix into a real-valued NumPy array suitable for RL algorithms.
         """
         rho = self._state.full().flatten()
         obs = np.concatenate((np.real(rho), np.imag(rho)))
-        return obs.astype(np.float32)                                       # Gymnasium expects the observation to be of type float32
-    
+        return obs.astype(
+            np.float32
+        )  # Gymnasium expects the observation to be of type float32
+
     def reset(self, seed=None):
         """
         Reset the environment to the initial state, preparing for a new episode.
         """
         self._save_episode_info()
 
-        time_diff = self._episode_info[-1]["elapsed_time"] - (self._episode_info[-2]["elapsed_time"] if len(self._episode_info) > 1 else time.mktime(self._result.start_local_time))
+        time_diff = self._episode_info[-1]["elapsed_time"] - (
+            self._episode_info[-2]["elapsed_time"]
+            if len(self._episode_info) > 1
+            else time.mktime(self._result.start_local_time)
+        )
         self._result.iter_seconds.append(time_diff)
-        self._current_step = 0                                           # Reset the step counter
-        self.current_episode += 1                                       # Increment episode counter
+        self._current_step = 0  # Reset the step counter
+        self.current_episode += 1  # Increment episode counter
         self._actions = self._temp_actions.copy()
         self.terminated = False
         self.truncated = False
@@ -239,61 +268,74 @@ def reset(self, seed=None):
         self._result._final_states = [self._state]
         self._state = self._initial
         return self._get_obs(), {}
-    
+
     def _save_result(self):
         """
-        Save the results of the optimization process, including the optimized 
+        Save the results of the optimization process, including the optimized
         pulse sequences, final states, and performance metrics.
         """
         result_obj = self._backup_result if self._use_backup_result else self._result
 
-        if(self._use_backup_result):
+        if self._use_backup_result:
             self._backup_result.iter_seconds = self._result.iter_seconds.copy()
             self._backup_result._final_states = self._result._final_states.copy()
             self._backup_result.infidelity = self._result.infidelity
 
         result_obj.end_local_time = time.localtime()
-        result_obj.n_iters = len(self._result.iter_seconds)  
-        result_obj.optimized_params = self._actions.copy() + [self._result.total_seconds]                      # If var_time is True, the last parameter is the evolution time
+        result_obj.n_iters = len(self._result.iter_seconds)
+        result_obj.optimized_params = self._actions.copy() + [
+            self._result.total_seconds
+        ]  # If var_time is True, the last parameter is the evolution time
         result_obj._optimized_controls = self._actions.copy()
         result_obj._guess_controls = []
         result_obj._optimized_H = [self._H]
 
-
     def result(self):
         """
         Final conversions and return of optimization results
-        """            
+        """
         if self._use_backup_result:
-            self._backup_result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time)       # Convert to a string
-            self._backup_result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time)           # Convert to a string
+            self._backup_result.start_local_time = time.strftime(
+                "%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time
+            )  # Convert to a string
+            self._backup_result.end_local_time = time.strftime(
+                "%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time
+            )  # Convert to a string
             return self._backup_result
         else:
             self._save_result()
-            self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time)       # Convert to a string
-            self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time)           # Convert to a string
+            self._result.start_local_time = time.strftime(
+                "%Y-%m-%d %H:%M:%S", self._result.start_local_time
+            )  # Convert to a string
+            self._result.end_local_time = time.strftime(
+                "%Y-%m-%d %H:%M:%S", self._result.end_local_time
+            )  # Convert to a string
             return self._result
 
     def train(self):
         """
-        Train the RL agent on the defined quantum control problem using the specified 
+        Train the RL agent on the defined quantum control problem using the specified
         reinforcement learning algorithm. Checks environment compatibility with Gym API.
         """
         # Check if the environment follows Gym API
         check_env(self, warn=True)
 
         # Create the model
-        model = PPO('MlpPolicy', self, verbose=1)       # verbose = 1 to display training progress and statistics in the terminal
-        
+        model = PPO(
+            "MlpPolicy", self, verbose=1
+        )  # verbose = 1 to display training progress and statistics in the terminal
+
         stop_callback = EarlyStopTraining(verbose=1)
-        
+
         # Train the model
-        model.learn(total_timesteps = self._total_timesteps, callback=stop_callback)
+        model.learn(total_timesteps=self._total_timesteps, callback=stop_callback)
+
 
 class EarlyStopTraining(BaseCallback):
     """
     A callback to stop training based on specific conditions (steps, infidelity, max iterations)
     """
+
     def __init__(self, verbose: int = 0):
         super(EarlyStopTraining, self).__init__(verbose)
 
@@ -304,31 +346,40 @@ def _on_step(self) -> bool:
         - Stop training if it finds an episode with infidelity <= than target infidelity
         - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training.
         """
-        env = self.training_env.envs[0].unwrapped
+        env = self.training_env.get_attr("unwrapped")[0]
 
         # Check if we need to stop training
         if env.current_episode >= env.max_episodes:
-            if env._use_backup_result == True:
+            if env._use_backup_result is True:
                 env._backup_result.message = f"Reached {env.max_episodes} episodes, stopping training. Return the last founded episode with infid < target_infid"
             else:
-                env._result.message = f"Reached {env.max_episodes} episodes, stopping training."
-            return False    # Stop training
-        elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses):
-            env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found"
+                env._result.message = (
+                    f"Reached {env.max_episodes} episodes, stopping training."
+                )
+            return False  # Stop training
+        elif (env._result.infidelity <= env._fid_err_targ) and not (env.shorter_pulses):
+            env._result.message = "Stop training because an episode with infidelity <= target infidelity was found"
             return False  # Stop training
         elif env.shorter_pulses:
-            if(env._result.infidelity <= env._fid_err_targ):        # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime
+            if (
+                env._result.infidelity <= env._fid_err_targ
+            ):  # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime
                 env._use_backup_result = True
                 env._save_result()
             if len(env._episode_info) >= 100:
                 last_100_episodes = env._episode_info[-100:]
 
-                min_steps = min(info['steps_used'] for info in last_100_episodes)
-                steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes)
-                infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes)
+                min_steps = min(info["steps_used"] for info in last_100_episodes)
+                steps_condition = all(
+                    ep["steps_used"] == min_steps for ep in last_100_episodes
+                )
+                infid_condition = all(
+                    ep["final_infidelity"] <= env._fid_err_targ
+                    for ep in last_100_episodes
+                )
 
                 if steps_condition and infid_condition:
                     env._use_backup_result = False
                     env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid."
                     return False  # Stop training
-        return True  # Continue training
\ No newline at end of file
+        return True  # Continue training
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index d320799..9e8ecc3 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -79,7 +79,7 @@ def optimize_pulses(
 
             - max_iter : int, optional
                 Maximum number of iterations to perform.
-                Referes to local minimizer steps or in the context of 
+                Referes to local minimizer steps or in the context of
                 `alg: "RL"` to the max. number of episodes.
                 Global steps default to 0 (no global optimization).
                 Can be overridden by specifying in minimizer_kwargs.
diff --git a/tests/test_result.py b/tests/test_result.py
index 7b9cda3..7146b04 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -156,18 +156,18 @@ def sin_z_jax(t, r, **kwargs):
 
 # state to state transfer
 initial = qt.basis(2, 0)
-target = (qt.basis(2, 0) + qt.basis(2, 1)).unit() # |+⟩
+target = (qt.basis(2, 0) + qt.basis(2, 1)).unit()  # |+⟩
 
-H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians
+H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()]  # control Hamiltonians
 
 w, d, y = 0.1, 1.0, 0.1
-H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax()) # drift Hamiltonian
+H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax())  # drift Hamiltonian
 
-H = [H_d] + H_c # total Hamiltonian
+H = [H_d] + H_c  # total Hamiltonian
 
 state2state_rl = Case(
     objectives=[Objective(initial, H, target)],
-    control_parameters = {
+    control_parameters={
         "p": {"bounds": [(-13, 13)]},
     },
     tlist=np.linspace(0, 10, 100),
@@ -177,17 +177,17 @@ def sin_z_jax(t, r, **kwargs):
         "max_iter": 20000,
         "shorter_pulses": True,
     },
-    optimizer_kwargs={}
+    optimizer_kwargs={},
 )
 
 # no big difference for unitary evolution
 
-initial = qt.qeye(2) # Identity
-target  = qt.gates.hadamard_transform()
+initial = qt.qeye(2)  # Identity
+target = qt.gates.hadamard_transform()
 
 unitary_rl = state2state_rl._replace(
     objectives=[Objective(initial, H, target)],
-        control_parameters = {
+    control_parameters={
         "p": {"bounds": [(-13, 13)]},
     },
     algorithm_kwargs={

From c115a9821b572e9f818e29f6841c5c28dbee9626 Mon Sep 17 00:00:00 2001
From: flowerthrower <flowerthrower@users.noreply.github.com>
Date: Fri, 4 Oct 2024 09:03:19 +0200
Subject: [PATCH 29/52] update changelog

---
 doc/changelog.rst | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index f2c5afe..2e869b0 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -3,6 +3,30 @@ Changelog
 *********
 
 
+Version 0.0.2 (Oct 04, 2024)
++++++++++++++++++++++++++++++++++
+
+This is an update to the beta release of ``qutip-qoc``.
+
+It mainly introduces the new reinforcement learning algorithm ``qutip_qoc._rl``.
+
+- Non-public facing functions have been renamed to start with an underscore.
+- As with other QuTiP functions, ``optimize_pulses`` now takes a ``tlist`` argument instead of ``_TimeInterval``.
+- The structure for the control guess and bounds has changed and now takes in an optional ``__time__`` keyword.
+- The ``result`` does no longer return ``optimized_objectives`` but instead ``optimized_H``.
+
+Features
+--------
+
+- New reinforcement learning algorithm, developed during GSOC24 (#19, #18, by LegionAtol)
+- Automatic transfromation of initial and target operator to superoperator (#23, by flowerthrower)
+
+Bug Fixes
+---------
+
+- Prevent loss of `__time__` keyword in optimize_pulses (#22, by flowerthrower)
+
+
 Version 0.0.1 (May xx, 2024)
 +++++++++++++++++++++++++++++++++
 

From fba6e250aa8080fc8e27c2c79faa6016874b2897 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 29 Oct 2024 14:01:22 +0000
Subject: [PATCH 30/52] Bump certifi in /doc in the pip group across 1
 directory

Bumps the pip group with 1 update in the /doc directory: [certifi](https://github.com/certifi/python-certifi).


Updates `certifi` from 2020.12.5 to 2024.7.4
- [Commits](https://github.com/certifi/python-certifi/compare/2020.12.05...2024.07.04)

---
updated-dependencies:
- dependency-name: certifi
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 doc/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/requirements.txt b/doc/requirements.txt
index 1cf3d0a..f731421 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -9,7 +9,7 @@ matplotlib>=3.6.1
 docutils==0.18.1
 alabaster==0.7.12
 Babel==2.9.1
-certifi==2020.12.5
+certifi==2024.7.4
 chardet==4.0.0
 colorama==0.4.4
 idna==2.10

From 2ef2db21b9b31e8a9c8cfaebbacbf3ae9c88acef Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 29 Oct 2024 14:01:22 +0000
Subject: [PATCH 31/52] Bump urllib3 in /doc in the pip group across 1
 directory

Bumps the pip group with 1 update in the /doc directory: [urllib3](https://github.com/urllib3/urllib3).


Updates `urllib3` from 1.26.4 to 1.26.19
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/1.26.4...1.26.19)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 doc/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/requirements.txt b/doc/requirements.txt
index 1cf3d0a..6885b8c 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -32,4 +32,4 @@ sphinxcontrib-htmlhelp==2.0.1
 sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.3
 sphinxcontrib-serializinghtml==1.1.5
-urllib3==1.26.4
+urllib3==1.26.19

From da869f678b613e851c7ce9e58294cd6d3081e9d1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 29 Oct 2024 14:01:25 +0000
Subject: [PATCH 32/52] Bump requests in /doc in the pip group across 1
 directory

Bumps the pip group with 1 update in the /doc directory: [requests](https://github.com/psf/requests).


Updates `requests` from 2.25.1 to 2.32.2
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.25.1...v2.32.2)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 doc/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/requirements.txt b/doc/requirements.txt
index 1cf3d0a..e1030ad 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -20,7 +20,7 @@ packaging==23.2
 Pygments==2.17.2
 pyparsing==2.4.7
 pytz==2021.1
-requests==2.25.1
+requests==2.32.2
 snowballstemmer==2.1.0
 Sphinx==6.1.3
 sphinx-gallery==0.12.2

From ec376e560dc0641cdf61582bccf7c3516d205025 Mon Sep 17 00:00:00 2001
From: LegionAtol <alessio.parato@gmail.com>
Date: Wed, 18 Dec 2024 11:41:16 +0100
Subject: [PATCH 33/52] corrections in algorithm execution time

---
 src/qutip_qoc/_rl.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py
index 9a75a55..21e8cb1 100644
--- a/src/qutip_qoc/_rl.py
+++ b/src/qutip_qoc/_rl.py
@@ -89,7 +89,7 @@ def create_pulse_func(idx):
         self._result = Result(
             objectives=objectives,
             time_interval=time_interval,
-            start_local_time=time.localtime(),  # initial optimization time
+            start_local_time=time.time(),  # initial optimization time
             n_iters=0,  # Number of iterations(episodes) until convergence
             iter_seconds=[],  # list containing the time taken for each iteration(episode) of the optimization
             var_time=True,  # Whether the optimization was performed with variable time
@@ -99,7 +99,7 @@ def create_pulse_func(idx):
         self._backup_result = Result(  # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid<target_infid
             objectives=objectives,
             time_interval=time_interval,
-            start_local_time=time.localtime(),
+            start_local_time=time.time(),
             n_iters=0,
             iter_seconds=[],
             var_time=True,
@@ -180,7 +180,7 @@ def _save_episode_info(self):
             "terminated": self.terminated,
             "truncated": self.truncated,
             "steps_used": self._current_step,
-            "elapsed_time": time.mktime(time.localtime()),
+            "elapsed_time": time.time(),
         }
         self._episode_info.append(episode_data)
 
@@ -256,7 +256,7 @@ def reset(self, seed=None):
         time_diff = self._episode_info[-1]["elapsed_time"] - (
             self._episode_info[-2]["elapsed_time"]
             if len(self._episode_info) > 1
-            else time.mktime(self._result.start_local_time)
+            else self._result.start_local_time
         )
         self._result.iter_seconds.append(time_diff)
         self._current_step = 0  # Reset the step counter
@@ -281,7 +281,7 @@ def _save_result(self):
             self._backup_result._final_states = self._result._final_states.copy()
             self._backup_result.infidelity = self._result.infidelity
 
-        result_obj.end_local_time = time.localtime()
+        result_obj.end_local_time = time.time()
         result_obj.n_iters = len(self._result.iter_seconds)
         result_obj.optimized_params = self._actions.copy() + [
             self._result.total_seconds
@@ -296,20 +296,20 @@ def result(self):
         """
         if self._use_backup_result:
             self._backup_result.start_local_time = time.strftime(
-                "%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time
-            )  # Convert to a string
+                "%Y-%m-%d %H:%M:%S", time.localtime(self._backup_result.start_local_time)
+            )
             self._backup_result.end_local_time = time.strftime(
-                "%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time
-            )  # Convert to a string
+                "%Y-%m-%d %H:%M:%S", time.localtime(self._backup_result.end_local_time)
+            )
             return self._backup_result
         else:
             self._save_result()
             self._result.start_local_time = time.strftime(
-                "%Y-%m-%d %H:%M:%S", self._result.start_local_time
-            )  # Convert to a string
+                "%Y-%m-%d %H:%M:%S", time.localtime(self._result.start_local_time)
+            )
             self._result.end_local_time = time.strftime(
-                "%Y-%m-%d %H:%M:%S", self._result.end_local_time
-            )  # Convert to a string
+                "%Y-%m-%d %H:%M:%S", time.localtime(self._result.end_local_time)
+            )
             return self._result
 
     def train(self):

From 9d7f277d17b7065a6abb904a29f1f882c7393af4 Mon Sep 17 00:00:00 2001
From: Paul Menczel <paul@menczel.net>
Date: Thu, 23 Jan 2025 13:11:24 +0900
Subject: [PATCH 34/52] Make all JAX and machine learning related dependencies
 optional

---
 README.md                       |   3 +
 doc/installation.rst            |  14 ++-
 requirements.txt                |   5 --
 setup.cfg                       |  14 +--
 src/qutip_qoc/_jopt.py          |  59 +++++++-----
 src/qutip_qoc/pulse_optim.py    |  13 ++-
 src/qutip_qoc/result.py         |  13 ++-
 tests/test_analytical_pulses.py |  87 ++++++++++--------
 tests/test_fidelity.py          |  91 +++++++++++--------
 tests/test_result.py            | 153 +++++++++++++++++++-------------
 10 files changed, 277 insertions(+), 175 deletions(-)

diff --git a/README.md b/README.md
index 9dde70f..8355177 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,9 @@ To install the package, use
 pip install qutip-qoc
 ```
 
+By default, the dependencies required for JOPT and for the RL (reinforcement learning) algorithm are omitted.
+They can be included by using the targets `qutip-qoc[jopt]` and `qutip-qoc[rl]`, respectively (or `qutip-qoc[full]` for all dependencies).
+
 ## Documentation and tutorials
 
 The documentation of `qutip-qoc` updated to the latest development version is hosted at [qutip-qoc.readthedocs.io](https://qutip-qoc.readthedocs.io/en/latest/).
diff --git a/doc/installation.rst b/doc/installation.rst
index f619143..5754564 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -23,7 +23,19 @@ In particular, the following packages are necessary for running ``qutip-qoc``:
 
 .. code-block:: bash
 
-    numpy scipy jax jaxlib cython qutip qutip-jax qutip-qtrl
+    numpy scipy cython qutip qutip-qtrl
+
+The following packages are required for using the JOPT algorithm:
+
+.. code-block:: bash
+
+    jax jaxlib qutip-jax
+
+The following packages are required for the RL (reinforcement learning) algorithm:
+
+.. code-block:: bash
+
+    gymnasium stable-baselines3
 
 The following package is used for testing:
 
diff --git a/requirements.txt b/requirements.txt
index 50da21d..7c55daa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,6 @@
 cython>=1.0
 numpy>=1.16.6,<2.0
 scipy>=1.10.1
-jax==0.4.28
-jaxlib==0.4.28
 qutip>=5.0.1
 qutip-qtrl
-qutip-jax
 pre-commit
-gymnasium>=0.29.1
-stable-baselines3>=2.3.2
diff --git a/setup.cfg b/setup.cfg
index 7872023..2547088 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -28,15 +28,10 @@ package_dir=
 packages = find:
 include_package_data = True
 install_requires =
-    jax
-    jaxlib
     packaging
     qutip
     qutip-qtrl
-    qutip-jax
     numpy>=1.16.6,<2.0
-    gymnasium>=0.29.1
-    stable-baselines3>=2.3.2
 setup_requires =
     cython>=1.0
     packaging
@@ -45,7 +40,16 @@ setup_requires =
 where = src
 
 [options.extras_require]
+jopt =
+    jax
+    jaxlib
+    qutip-jax
+rl =
+    gymnasium>=0.29.1
+    stable-baselines3>=2.3.2
 tests =
     pytest>=6.0
 full =
+    %(jopt)s
+    %(rl)s
     %(tests)s
diff --git a/src/qutip_qoc/_jopt.py b/src/qutip_qoc/_jopt.py
index 0e94491..6274076 100644
--- a/src/qutip_qoc/_jopt.py
+++ b/src/qutip_qoc/_jopt.py
@@ -5,38 +5,47 @@
 import qutip as qt
 from qutip import Qobj, QobjEvo
 
-from diffrax import Dopri5, PIDController
+try:
+    import jax
+    from jax import custom_jvp
+    import jax.numpy as jnp
+    import qutip_jax  # noqa: F401
 
-import jax
-from jax import custom_jvp
-import jax.numpy as jnp
-import qutip_jax  # noqa: F401
+    import jaxlib  # noqa: F401
 
+    from diffrax import Dopri5, PIDController
 
-@custom_jvp
-def _abs(x):
-    return jnp.abs(x)
+    _jax_available = True
+except ImportError:
+    _jax_available = False
 
 
-def _abs_jvp(primals, tangents):
-    """
-    Custom jvp for absolute value of complex functions
-    """
-    (x,) = primals
-    (t,) = tangents
+if _jax_available:
+
+    @custom_jvp
+    def _abs(x):
+        return jnp.abs(x)
+
 
-    abs_x = _abs(x)
-    res = jnp.where(
-        abs_x == 0,
-        0.0,  # prevent division by zero
-        jnp.real(jnp.multiply(jnp.conj(x), t)) / abs_x,
-    )
+    def _abs_jvp(primals, tangents):
+        """
+        Custom jvp for absolute value of complex functions
+        """
+        (x,) = primals
+        (t,) = tangents
+
+        abs_x = _abs(x)
+        res = jnp.where(
+            abs_x == 0,
+            0.0,  # prevent division by zero
+            jnp.real(jnp.multiply(jnp.conj(x), t)) / abs_x,
+        )
 
-    return abs_x, res
+        return abs_x, res
 
 
-# register custom jvp for absolut value of complex functions
-_abs.defjvp(_abs_jvp)
+    # register custom jvp for absolut value of complex functions
+    _abs.defjvp(_abs_jvp)
 
 
 class _JOPT:
@@ -55,6 +64,10 @@ def __init__(
         guess_params,
         **integrator_kwargs,
     ):
+        if not _jax_available:
+            raise ImportError("The JOPT algorithm requires the modules jax, "
+                              "jaxlib, and qutip_jax to be installed.")
+
         self._Hd = objective.H[0]
         self._Hc_lst = objective.H[1:]
 
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 5f65e46..94d424c 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -10,7 +10,12 @@
 
 from qutip_qoc._optimizer import _global_local_optimization
 from qutip_qoc._time import _TimeInterval
-from qutip_qoc._rl import _RL
+
+try:
+    from qutip_qoc._rl import _RL
+    _rl_available = True
+except ImportError:
+    _rl_available = False
 
 __all__ = ["optimize_pulses"]
 
@@ -353,6 +358,12 @@ def optimize_pulses(
             qtrl_optimizers.append(qtrl_optimizer)
 
     elif alg == "RL":
+        if not _rl_available:
+            raise ImportError(
+                "The required dependencies (gymnasium, stable-baselines3) for "
+                "the reinforcement learning algorithm are not available."
+            )
+
         rl_env = _RL(
             objectives,
             control_parameters,
diff --git a/src/qutip_qoc/result.py b/src/qutip_qoc/result.py
index 493104a..ef74941 100644
--- a/src/qutip_qoc/result.py
+++ b/src/qutip_qoc/result.py
@@ -2,7 +2,6 @@
 This module contains the Result class for storing and
 reporting the results of a full pulse control optimization run.
 """
-import jaxlib
 import pickle
 import textwrap
 import numpy as np
@@ -11,6 +10,12 @@
 
 import qutip as qt
 
+try:
+    import jaxlib
+    _jitfun_type = jaxlib.xla_extension.PjitFunction
+except ImportError:
+    _jitfun_type = None
+
 __all__ = ["Result"]
 
 
@@ -331,8 +336,10 @@ def final_states(self):
                 evo_time = self.time_interval.evo_time
 
             # choose solver method based on type of control function
-            if isinstance(
-                self.objectives[0].H[1][1], jaxlib.xla_extension.PjitFunction
+            # if jax is installed, _jitfun_type is set to
+            # jaxlib.xla_extension.PjitFunction, otherwise it is None
+            if _jitfun_type is not None and isinstance(
+                self.objectives[0].H[1][1], _jitfun_type
             ):
                 method = "diffrax"  # for JAX defined contols
             else:
diff --git a/tests/test_analytical_pulses.py b/tests/test_analytical_pulses.py
index 616106b..ecc1417 100644
--- a/tests/test_analytical_pulses.py
+++ b/tests/test_analytical_pulses.py
@@ -4,11 +4,16 @@
 
 import pytest
 import qutip as qt
-import qutip_jax  # noqa: F401
 import numpy as np
-import jax.numpy as jnp
 import collections
 
+try:
+    import jax.numpy as jnp
+    import qutip_jax  # noqa: F401
+    _jax_available = True
+except ImportError:
+    _jax_available = False
+
 from qutip_qoc.pulse_optim import optimize_pulses
 from qutip_qoc.objective import Objective
 
@@ -139,52 +144,59 @@ def grad_sin(t, p, idx):
 )
 
 
-# ----------------------- System and JAX Control ---------------------
+if _jax_available:
+    # ----------------------- System and JAX Control ---------------------
 
 
-def sin_jax(t, p):
-    return p[0] * jnp.sin(p[1] * t + p[2])
+    def sin_jax(t, p):
+        return p[0] * jnp.sin(p[1] * t + p[2])
 
 
-Hc_jax = [
-    [qt.sigmax(), lambda t, p: sin_jax(t, p)],
-    [qt.sigmay(), lambda t, q: sin_jax(t, q)],
-]
+    Hc_jax = [
+        [qt.sigmax(), lambda t, p: sin_jax(t, p)],
+        [qt.sigmay(), lambda t, q: sin_jax(t, q)],
+    ]
 
-H_jax = H_d + Hc_jax
+    H_jax = H_d + Hc_jax
 
-# ------------------------------- Objective -------------------------------
+    # ------------------------------- Objective -------------------------------
 
-# state to state transfer
-state2state_jax = state2state._replace(
-    objectives=[Objective(initial, H_jax, target)],
-    algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
-)
+    # state to state transfer
+    state2state_jax = state2state._replace(
+        objectives=[Objective(initial, H_jax, target)],
+        algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
+    )
 
-# unitary gate synthesis
-unitary_jax = unitary._replace(
-    objectives=[Objective(initial_U, H_jax, target_U)],
-    algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
-)
+    # unitary gate synthesis
+    unitary_jax = unitary._replace(
+        objectives=[Objective(initial_U, H_jax, target_U)],
+        algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
+    )
 
-# unitary gate synthesis - time optimization
-time_jax = time._replace(
-    objectives=[Objective(initial_U, H_jax, target_U)],
-    algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
-)
+    # unitary gate synthesis - time optimization
+    time_jax = time._replace(
+        objectives=[Objective(initial_U, H_jax, target_U)],
+        algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
+    )
 
-# map synthesis
-Lc_jax = [
-    [qt.liouvillian(qt.sigmax()), lambda t, p: sin_jax(t, p)],
-    [qt.liouvillian(qt.sigmay()), lambda t, q: sin_jax(t, q)],
-]
-L_jax = L_d + Lc_jax
+    # map synthesis
+    Lc_jax = [
+        [qt.liouvillian(qt.sigmax()), lambda t, p: sin_jax(t, p)],
+        [qt.liouvillian(qt.sigmay()), lambda t, q: sin_jax(t, q)],
+    ]
+    L_jax = L_d + Lc_jax
 
-mapping_jax = mapping._replace(
-    objectives=[Objective(initial_map, L_jax, target_map)],
-    algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.1},  # relaxed objective
-)
+    mapping_jax = mapping._replace(
+        objectives=[Objective(initial_map, L_jax, target_map)],
+        algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.1},  # relaxed objective
+    )
 
+else:
+    # jax not available, set these to none so tests will be skipped
+    state2state_jax = None
+    unitary_jax = None
+    time_jax = None
+    mapping_jax = None
 
 @pytest.fixture(
     params=[
@@ -205,6 +217,9 @@ def tst(request):
 
 
 def test_optimize_pulses(tst):
+    if tst is None:
+        pytest.skip("JAX not available")
+
     result = optimize_pulses(
         tst.objectives,
         tst.control_parameters,
diff --git a/tests/test_fidelity.py b/tests/test_fidelity.py
index 8490a3e..2c3509d 100644
--- a/tests/test_fidelity.py
+++ b/tests/test_fidelity.py
@@ -5,9 +5,14 @@
 import pytest
 import qutip as qt
 import numpy as np
-import jax.numpy as jnp
 import collections
 
+try:
+    import jax.numpy as jnp
+    _jax_available = True
+except ImportError:
+    _jax_available = False
+
 from qutip_qoc.pulse_optim import optimize_pulses
 from qutip_qoc.objective import Objective
 
@@ -108,55 +113,64 @@ def grad_sin(t, p, idx):
     algorithm_kwargs={"alg": "GOAT", "fid_type": "TRACEDIFF"},
 )
 
-# ----------------------- System and JAX Control ---------------------
+if _jax_available:
+    # ----------------------- System and JAX Control ---------------------
 
 
-def sin_jax(t, p):
-    return p[0] * jnp.sin(p[1] * t + p[2])
+    def sin_jax(t, p):
+        return p[0] * jnp.sin(p[1] * t + p[2])
 
 
-Hc_jax = [
-    [qt.sigmax(), lambda t, p: sin_jax(t, p)],
-    [qt.sigmay(), lambda t, q: sin_jax(t, q)],
-]
+    Hc_jax = [
+        [qt.sigmax(), lambda t, p: sin_jax(t, p)],
+        [qt.sigmay(), lambda t, q: sin_jax(t, q)],
+    ]
 
-H_jax = H_d + Hc_jax
+    H_jax = H_d + Hc_jax
 
-# ------------------------------- Objective -------------------------------
+    # ------------------------------- Objective -------------------------------
 
-# state to state transfer
-PSU_state2state_jax = PSU_state2state._replace(
-    objectives=[Objective(initial, H_jax, (-1j) * initial)],
-    algorithm_kwargs={"alg": "JOPT"},
-)
+    # state to state transfer
+    PSU_state2state_jax = PSU_state2state._replace(
+        objectives=[Objective(initial, H_jax, (-1j) * initial)],
+        algorithm_kwargs={"alg": "JOPT"},
+    )
 
-SU_state2state_jax = SU_state2state._replace(
-    objectives=[Objective(initial, H_jax, initial)], algorithm_kwargs={"alg": "JOPT"}
-)
+    SU_state2state_jax = SU_state2state._replace(
+        objectives=[Objective(initial, H_jax, initial)], algorithm_kwargs={"alg": "JOPT"}
+    )
 
 
-# unitary gate synthesis
-PSU_unitary_jax = PSU_unitary._replace(
-    objectives=[Objective(initial_U, H_jax, (-1j) * initial_U)],
-    algorithm_kwargs={"alg": "JOPT"},
-)
+    # unitary gate synthesis
+    PSU_unitary_jax = PSU_unitary._replace(
+        objectives=[Objective(initial_U, H_jax, (-1j) * initial_U)],
+        algorithm_kwargs={"alg": "JOPT"},
+    )
 
-SU_unitary_jax = SU_unitary._replace(
-    objectives=[Objective(initial_U, H_jax, initial_U)],
-    algorithm_kwargs={"alg": "JOPT"},
-)
+    SU_unitary_jax = SU_unitary._replace(
+        objectives=[Objective(initial_U, H_jax, initial_U)],
+        algorithm_kwargs={"alg": "JOPT"},
+    )
 
-# map synthesis
-Lc_jax = [
-    [qt.liouvillian(qt.sigmax()), lambda t, p: sin_jax(t, p)],
-    [qt.liouvillian(qt.sigmay()), lambda t, q: sin_jax(t, q)],
-]
-L_jax = L_d + Lc_jax
+    # map synthesis
+    Lc_jax = [
+        [qt.liouvillian(qt.sigmax()), lambda t, p: sin_jax(t, p)],
+        [qt.liouvillian(qt.sigmay()), lambda t, q: sin_jax(t, q)],
+    ]
+    L_jax = L_d + Lc_jax
 
-TRCDIFF_map_jax = TRCDIFF_map._replace(
-    objectives=[Objective(initial_map, L_jax, initial_map)],
-    algorithm_kwargs={"alg": "JOPT", "fid_type": "TRACEDIFF"},
-)
+    TRCDIFF_map_jax = TRCDIFF_map._replace(
+        objectives=[Objective(initial_map, L_jax, initial_map)],
+        algorithm_kwargs={"alg": "JOPT", "fid_type": "TRACEDIFF"},
+    )
+
+else:
+    # jax not available, set these to none so tests will be skipped
+    PSU_state2state_jax = None
+    SU_state2state_jax = None
+    PSU_unitary_jax = None
+    SU_unitary_jax = None
+    TRCDIFF_map_jax = None
 
 
 @pytest.fixture(
@@ -182,6 +196,9 @@ def tst(request):
 
 
 def test_optimize_pulses(tst):
+    if tst is None:
+        pytest.skip("JAX not available")
+
     result = optimize_pulses(
         tst.objectives,
         tst.control_parameters,
diff --git a/tests/test_result.py b/tests/test_result.py
index 7146b04..6bd7f8c 100644
--- a/tests/test_result.py
+++ b/tests/test_result.py
@@ -5,10 +5,22 @@
 import pytest
 import qutip as qt
 import numpy as np
-import jax
-import jax.numpy as jnp
 import collections
 
+try:
+    import jax
+    import jax.numpy as jnp
+    _jax_available = True
+except ImportError:
+    _jax_available = False
+
+try:
+    import gymnasium
+    import stable_baselines3
+    _rl_available = True
+except ImportError:
+    _rl_available = False
+
 from qutip_qoc.pulse_optim import optimize_pulses
 from qutip_qoc.objective import Objective
 from qutip_qoc._time import _TimeInterval
@@ -84,37 +96,42 @@ def grad_sin(t, p, idx):
     objectives=[Objective(initial, H, target)],
     algorithm_kwargs={"alg": "CRAB", "fid_err_targ": 0.01},
 )
-# ----------------------- JAX ---------------------
 
+if _jax_available:
+    # ----------------------- JAX ---------------------
 
-def sin_jax(t, p):
-    return p[0] * jnp.sin(p[1] * t + p[2])
 
+    def sin_jax(t, p):
+        return p[0] * jnp.sin(p[1] * t + p[2])
 
-@jax.jit
-def sin_x_jax(t, p, **kwargs):
-    return sin_jax(t, p)
 
+    @jax.jit
+    def sin_x_jax(t, p, **kwargs):
+        return sin_jax(t, p)
 
-@jax.jit
-def sin_y_jax(t, q, **kwargs):
-    return sin_jax(t, q)
 
+    @jax.jit
+    def sin_y_jax(t, q, **kwargs):
+        return sin_jax(t, q)
 
-@jax.jit
-def sin_z_jax(t, r, **kwargs):
-    return sin_jax(t, r)
 
+    @jax.jit
+    def sin_z_jax(t, r, **kwargs):
+        return sin_jax(t, r)
 
-Hc_jax = [[qt.sigmax(), sin_x_jax], [qt.sigmay(), sin_y_jax], [qt.sigmaz(), sin_z_jax]]
 
-H_jax = H_d + Hc_jax
+    Hc_jax = [[qt.sigmax(), sin_x_jax], [qt.sigmay(), sin_y_jax], [qt.sigmaz(), sin_z_jax]]
 
-# state to state transfer
-state2state_jax = state2state_goat._replace(
-    objectives=[Objective(initial, H_jax, target)],
-    algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
-)
+    H_jax = H_d + Hc_jax
+
+    # state to state transfer
+    state2state_jax = state2state_goat._replace(
+        objectives=[Objective(initial, H_jax, target)],
+        algorithm_kwargs={"alg": "JOPT", "fid_err_targ": 0.01},
+    )
+
+else:
+    state2state_jax = None
 
 
 # ------------------- discrete CRAB / GRAPE  control ------------------------
@@ -152,51 +169,56 @@ def sin_z_jax(t, r, **kwargs):
     algorithm_kwargs={"alg": "CRAB", "fid_err_targ": 0.01, "fix_frequency": False},
 )
 
-# ----------------------- RL --------------------
-
-# state to state transfer
-initial = qt.basis(2, 0)
-target = (qt.basis(2, 0) + qt.basis(2, 1)).unit()  # |+⟩
-
-H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()]  # control Hamiltonians
-
-w, d, y = 0.1, 1.0, 0.1
-H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax())  # drift Hamiltonian
-
-H = [H_d] + H_c  # total Hamiltonian
-
-state2state_rl = Case(
-    objectives=[Objective(initial, H, target)],
-    control_parameters={
-        "p": {"bounds": [(-13, 13)]},
-    },
-    tlist=np.linspace(0, 10, 100),
-    algorithm_kwargs={
-        "fid_err_targ": 0.01,
-        "alg": "RL",
-        "max_iter": 20000,
-        "shorter_pulses": True,
-    },
-    optimizer_kwargs={},
-)
-
-# no big difference for unitary evolution
+if _rl_available:
+    # ----------------------- RL --------------------
+
+    # state to state transfer
+    initial = qt.basis(2, 0)
+    target = (qt.basis(2, 0) + qt.basis(2, 1)).unit()  # |+⟩
+
+    H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()]  # control Hamiltonians
+
+    w, d, y = 0.1, 1.0, 0.1
+    H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax())  # drift Hamiltonian
+
+    H = [H_d] + H_c  # total Hamiltonian
+
+    state2state_rl = Case(
+        objectives=[Objective(initial, H, target)],
+        control_parameters={
+            "p": {"bounds": [(-13, 13)]},
+        },
+        tlist=np.linspace(0, 10, 100),
+        algorithm_kwargs={
+            "fid_err_targ": 0.01,
+            "alg": "RL",
+            "max_iter": 20000,
+            "shorter_pulses": True,
+        },
+        optimizer_kwargs={},
+    )
 
-initial = qt.qeye(2)  # Identity
-target = qt.gates.hadamard_transform()
+    # no big difference for unitary evolution
+
+    initial = qt.qeye(2)  # Identity
+    target = qt.gates.hadamard_transform()
+
+    unitary_rl = state2state_rl._replace(
+        objectives=[Objective(initial, H, target)],
+        control_parameters={
+            "p": {"bounds": [(-13, 13)]},
+        },
+        algorithm_kwargs={
+            "fid_err_targ": 0.01,
+            "alg": "RL",
+            "max_iter": 300,
+            "shorter_pulses": True,
+        },
+    )
 
-unitary_rl = state2state_rl._replace(
-    objectives=[Objective(initial, H, target)],
-    control_parameters={
-        "p": {"bounds": [(-13, 13)]},
-    },
-    algorithm_kwargs={
-        "fid_err_targ": 0.01,
-        "alg": "RL",
-        "max_iter": 300,
-        "shorter_pulses": True,
-    },
-)
+else: # skip RL tests
+    state2state_rl = None
+    unitary_rl = None
 
 
 @pytest.fixture(
@@ -215,6 +237,9 @@ def tst(request):
 
 
 def test_optimize_pulses(tst):
+    if tst is None:
+        pytest.skip("Dependency not available")
+
     result = optimize_pulses(
         tst.objectives,
         tst.control_parameters,

From 2f6c8a347125a089545ced45157d3b6e42d66f1a Mon Sep 17 00:00:00 2001
From: Paul Menczel <paul@menczel.net>
Date: Thu, 13 Feb 2025 14:44:34 +0900
Subject: [PATCH 35/52] Fix minor bug

---
 src/qutip_qoc/result.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/qutip_qoc/result.py b/src/qutip_qoc/result.py
index ef74941..be63877 100644
--- a/src/qutip_qoc/result.py
+++ b/src/qutip_qoc/result.py
@@ -11,6 +11,7 @@
 import qutip as qt
 
 try:
+    import jax
     import jaxlib
     _jitfun_type = jaxlib.xla_extension.PjitFunction
 except ImportError:

From 8cc4e0913a587b52fc6182be0499240fc4f4ffc8 Mon Sep 17 00:00:00 2001
From: Paul Menczel <paul@menczel.net>
Date: Thu, 13 Feb 2025 14:55:47 +0900
Subject: [PATCH 36/52] Update versions of action tools

---
 .github/workflows/build.yml               | 4 ++--
 .github/workflows/build_documentation.yml | 4 ++--
 .github/workflows/test.yml                | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3db5b94..bc8c88c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,7 +33,7 @@ jobs:
       - name: Build a binary wheel and a source tarball
         run: python3 -m cibuildwheel --output-dir dist
       - name: Store the distribution packages
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: python-package-distributions
           path: dist/
@@ -52,7 +52,7 @@ jobs:
 
     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: python-package-distributions
           path: dist/
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index f2c45e2..dd1950f 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: actions/setup-python@v4
         name: Install Python
@@ -42,7 +42,7 @@ jobs:
           #   -T : display a full traceback if a Python exception occurs
 
       - name: Upload built files
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: qutip_qoc_html_docs
           path: doc/_build/html/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 69e1b03..14185e3 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -47,7 +47,7 @@ jobs:
             python-version: "3.12"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -92,7 +92,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python
         uses: actions/setup-python@v4
         with:

From b0fd00c747b1997886d5350afab32d1d9f83ed29 Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <rochisha.agarwal2302@gmail.com>
Date: Wed, 26 Feb 2025 08:09:30 +0530
Subject: [PATCH 37/52] add check for operator input

---
 src/qutip_qoc/pulse_optim.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 94d424c..349cebf 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -11,6 +11,8 @@
 from qutip_qoc._optimizer import _global_local_optimization
 from qutip_qoc._time import _TimeInterval
 
+import qutip as qt
+
 try:
     from qutip_qoc._rl import _RL
     _rl_available = True
@@ -191,6 +193,11 @@ def optimize_pulses(
     # prepare qtrl optimizers
     qtrl_optimizers = []
     if alg == "CRAB" or alg == "GRAPE":
+        dyn_type = "GEN_MAT"
+        for objective in objectives:
+            if any(qt.isoper(H_i) for H_i in (objective.H if isinstance(objective.H, list) else [objective.H])):
+                dyn_type = "UNIT"
+
         if alg == "GRAPE":  # algorithm specific kwargs
             use_as_amps = True
             minimizer_kwargs.setdefault("method", "L-BFGS-B")  # gradient
@@ -247,7 +254,7 @@ def optimize_pulses(
                 "accuracy_factor": None,  # deprecated
                 "alg_params": alg_params,
                 "optim_params": algorithm_kwargs.get("optim_params", None),
-                "dyn_type": algorithm_kwargs.get("dyn_type", "GEN_MAT"),
+                "dyn_type": algorithm_kwargs.get("dyn_type", dyn_type),
                 "dyn_params": algorithm_kwargs.get("dyn_params", None),
                 "prop_type": algorithm_kwargs.get(
                     "prop_type", "DEF"

From 37de33fef4d5d2bb56f1126f8a319b31187c45a1 Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <rochisha.agarwal2302@gmail.com>
Date: Mon, 10 Mar 2025 09:44:33 +0530
Subject: [PATCH 38/52] Fix objective for open system

---
 src/qutip_qoc/objective.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/qutip_qoc/objective.py b/src/qutip_qoc/objective.py
index d91caac..06b8294 100644
--- a/src/qutip_qoc/objective.py
+++ b/src/qutip_qoc/objective.py
@@ -83,8 +83,14 @@ def __init__(self, initial, H, target, weight=1):
         # Check if any Hamiltonian in H is a superoperator
         if any(qt.issuper(H_i) for H_i in (H if isinstance(H, list) else [H])):
             # Convert initial and target accordingly
-            self.initial = qt.to_super(self.initial)
-            self.target = qt.to_super(self.target)
+            if qt.isket(self.initial):
+                self.initial = qt.operator_to_vector(qt.ket2dm(self.initial))
+            elif qt.isoper(self.initial):
+                self.initial = qt.operator_to_vector(self.initial)
+            if qt.isket(self.target):
+                self.target = qt.operator_to_vector(qt.ket2dm(self.target))
+            elif qt.isoper(self.target):
+                self.target = qt.operator_to_vector(self.target)
 
     def __getstate__(self):
         """

From e2781bcd1f02da591eee876e7334ddfa5bb698b8 Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <rochisha.agarwal2302@gmail.com>
Date: Wed, 12 Mar 2025 12:18:30 +0530
Subject: [PATCH 39/52] add logic for gate synthesis

---
 src/qutip_qoc/objective.py   | 12 ------------
 src/qutip_qoc/pulse_optim.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/qutip_qoc/objective.py b/src/qutip_qoc/objective.py
index 06b8294..986961a 100644
--- a/src/qutip_qoc/objective.py
+++ b/src/qutip_qoc/objective.py
@@ -80,18 +80,6 @@ def __init__(self, initial, H, target, weight=1):
         self.target = target
         self.weight = weight
 
-        # Check if any Hamiltonian in H is a superoperator
-        if any(qt.issuper(H_i) for H_i in (H if isinstance(H, list) else [H])):
-            # Convert initial and target accordingly
-            if qt.isket(self.initial):
-                self.initial = qt.operator_to_vector(qt.ket2dm(self.initial))
-            elif qt.isoper(self.initial):
-                self.initial = qt.operator_to_vector(self.initial)
-            if qt.isket(self.target):
-                self.target = qt.operator_to_vector(qt.ket2dm(self.target))
-            elif qt.isoper(self.target):
-                self.target = qt.operator_to_vector(self.target)
-
     def __getstate__(self):
         """
         Extract picklable information from the objective.
diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 349cebf..3f3377d 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -30,6 +30,7 @@ def optimize_pulses(
     optimizer_kwargs=None,
     minimizer_kwargs=None,
     integrator_kwargs=None,
+    optimization_type=None,
 ):
     """
     Run GOAT, JOPT, GRAPE, CRAB or RL optimization.
@@ -126,6 +127,9 @@ def optimize_pulses(
         Options for the solver, see :obj:`MESolver.options` and
         `Integrator <./classes.html#classes-ode>`_ for a list of all options.
 
+    optimization_type : str, optional
+        Type of optimization: "state_transfer", "gate_synthesis", or None.
+
     Returns
     -------
     result : :class:`qutip_qoc.Result`
@@ -189,6 +193,31 @@ def optimize_pulses(
             "maxiter": algorithm_kwargs.get("max_iter", 1000),
             "gtol": algorithm_kwargs.get("min_grad", 0.0 if alg == "CRAB" else 1e-8),
         }
+    # Iterate over objectives and convert initial and target states based on the optimization type
+    for objective in objectives:
+        if any(qt.issuper(H_i) for H_i in (objective.H if isinstance(objective.H, list) else [objective.H])):
+            if optimization_type == "state_transfer":
+                if qt.isket(objective.initial):
+                    objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
+                elif qt.isoper(objective.initial):
+                    objective.initial = qt.operator_to_vector(objective.initial)
+                if qt.isket(objective.target):
+                    objective.target = qt.operator_to_vector(qt.ket2dm(objective.target))
+                elif qt.isoper(objective.target):
+                    objective.target = qt.operator_to_vector(objective.target)
+            elif optimization_type == "gate_synthesis":
+                objective.initial = qt.to_super(objective.initial)
+                objective.target = qt.to_super(objective.target)
+            elif optimization_type is None:
+                if qt.isoper(objective.initial) and qt.isoper(objective.target):
+                    if np.isclose(qt.tr(objective.initial), 1) and np.isclose(qt.tr(objective.target), 1):
+                        objective.initial = qt.operator_to_vector(objective.initial)
+                        objective.target = qt.operator_to_vector(objective.target)
+                    else:
+                        objective.initial = qt.to_super(objective.initial)
+                        objective.target = qt.to_super(objective.target)
+                if qt.isket(objective.initial):
+                    objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
 
     # prepare qtrl optimizers
     qtrl_optimizers = []

From aec775f8d970f4dc91ff9d9dc3c2abc58469a5b3 Mon Sep 17 00:00:00 2001
From: jyotiraj-code <jyotirajnath7@gmail.com>
Date: Sat, 15 Mar 2025 01:44:34 +0530
Subject: [PATCH 40/52] Fixing README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8355177..b42d479 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ They can be included by using the targets `qutip-qoc[jopt]` and `qutip-qoc[rl]`,
 
 ## Documentation and tutorials
 
-The documentation of `qutip-qoc` updated to the latest development version is hosted at [qutip-qoc.readthedocs.io](https://qutip-qoc.readthedocs.io/en/latest/).
+The documentation of `qutip-qoc` updated to the latest development version is hosted at [qutip-qoc.readthedocs.io](https://qutip-qoc.readthedocs.io/latest/).
 Tutorials related to using quantum optimal control in `qutip-qoc` can be found [_here_](https://qutip.org/qutip-tutorials/#optimal-control).
 
 ## Installation from source
@@ -43,7 +43,7 @@ pip install --upgrade pip
 pip install -e .
 ```
 
-which makes sure that you are up to date with the latest `pip` version. Contribution guidelines are available [_here_](https://qutip-qoc.readthedocs.io/en/latest/contribution-code.html).
+which makes sure that you are up to date with the latest `pip` version. Contribution guidelines are available [_here_](https://qutip-qoc.readthedocs.io/latest/contribution/code.html).
 
 To build and test the documentation, additional packages need to be installed:
 

From 02d361a0e0efb704eb2980f89f17048131b16a25 Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <rochisha.agarwal2302@gmail.com>
Date: Tue, 18 Mar 2025 10:07:12 +0530
Subject: [PATCH 41/52] improve the clarity of the code

---
 src/qutip_qoc/pulse_optim.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 3f3377d..faa0e09 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -128,7 +128,9 @@ def optimize_pulses(
         `Integrator <./classes.html#classes-ode>`_ for a list of all options.
 
     optimization_type : str, optional
-        Type of optimization: "state_transfer", "gate_synthesis", or None.
+        Type of optimization. By default, QuTiP-QOC will try to automatically determine 
+        whether this is a *state transfer* or a *gate synthesis* problem. Set this 
+        flag to ``"state_transfer"`` or ``"gate_synthesis"`` to set the mode manually.
 
     Returns
     -------
@@ -195,8 +197,9 @@ def optimize_pulses(
         }
     # Iterate over objectives and convert initial and target states based on the optimization type
     for objective in objectives:
-        if any(qt.issuper(H_i) for H_i in (objective.H if isinstance(objective.H, list) else [objective.H])):
-            if optimization_type == "state_transfer":
+        H_list = objective.H if isinstance(objective.H, list) else [objective.H]
+        if any(qt.issuper(H_i) for H_i in H_list):
+            if isinstance(optimization_type, str) and optimization_type.lower() == "state_transfer":
                 if qt.isket(objective.initial):
                     objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
                 elif qt.isoper(objective.initial):
@@ -205,7 +208,7 @@ def optimize_pulses(
                     objective.target = qt.operator_to_vector(qt.ket2dm(objective.target))
                 elif qt.isoper(objective.target):
                     objective.target = qt.operator_to_vector(objective.target)
-            elif optimization_type == "gate_synthesis":
+            elif isinstance(optimization_type, str) and optimization_type.lower() == "gate_synthesis":
                 objective.initial = qt.to_super(objective.initial)
                 objective.target = qt.to_super(objective.target)
             elif optimization_type is None:
@@ -218,6 +221,8 @@ def optimize_pulses(
                         objective.target = qt.to_super(objective.target)
                 if qt.isket(objective.initial):
                     objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
+                if qt.isket(objective.target):
+                    objective.target = qt.operator_to_vector(qt.ket2dm(objective.target))
 
     # prepare qtrl optimizers
     qtrl_optimizers = []

From 029d85431d2bfc755d0655e58314f077c9b38c55 Mon Sep 17 00:00:00 2001
From: jyotiraj-code <jyotirajnath7@gmail.com>
Date: Tue, 18 Mar 2025 19:41:31 +0530
Subject: [PATCH 42/52] Updated link QuTiP contribution link

---
 doc/contribution/code.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/contribution/code.rst b/doc/contribution/code.rst
index 8a961b8..6cf0705 100644
--- a/doc/contribution/code.rst
+++ b/doc/contribution/code.rst
@@ -7,7 +7,7 @@ Contributing to the source code
 Build up an development environment
 ===================================
 
-Please follow the instruction on the `QuTiP contribution guide <https://qutip.org/docs/latest/development/contributing.html#building>`_ to
+Please follow the instruction on the `QuTiP contribution guide <https://qutip.readthedocs.io/en/latest/development/contributing.html>`_ to
 build a conda environment.
 
 You don't need to build ``qutip`` in the editable mode unless you also want to contribute to `qutip`.

From ae447b655b456e259c4f51fd99fe03b3fab3da2c Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <rochisha.agarwal2302@gmail.com>
Date: Fri, 28 Mar 2025 00:13:18 +0530
Subject: [PATCH 43/52] fix trace

---
 src/qutip_qoc/pulse_optim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index faa0e09..935bf18 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -213,7 +213,7 @@ def optimize_pulses(
                 objective.target = qt.to_super(objective.target)
             elif optimization_type is None:
                 if qt.isoper(objective.initial) and qt.isoper(objective.target):
-                    if np.isclose(qt.tr(objective.initial), 1) and np.isclose(qt.tr(objective.target), 1):
+                    if np.isclose((objective.initial).tr(), 1) and np.isclose((objective.target).tr(), 1):
                         objective.initial = qt.operator_to_vector(objective.initial)
                         objective.target = qt.operator_to_vector(objective.target)
                     else:

From 5224a2136cd1fba66e94adb54db1c67e927e6b7c Mon Sep 17 00:00:00 2001
From: Akhils777 <akhilpratapsingh777@gmail.com>
Date: Tue, 6 May 2025 03:38:53 +0530
Subject: [PATCH 44/52] documentation  fix

---
 doc/conf.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/conf.py b/doc/conf.py
index ab65dc7..96ca014 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -62,6 +62,10 @@ def qutip_qoc_version():
 
 html_theme = "sphinx_rtd_theme"
 html_static_path = []
+html_js_files = [
+    'https://code.jquery.com/jquery-3.6.0.min.js',
+]
+
 
 # -- Options for numpydoc ---------------------------------------
 

From 9a1875c107ef89295420ede931a8726cc688d879 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 15 May 2025 20:12:30 +0000
Subject: [PATCH 45/52] Bump idna from 2.10 to 3.7 in /doc in the pip group
 across 1 directory

Bumps the pip group with 1 update in the /doc directory: [idna](https://github.com/kjd/idna).


Updates `idna` from 2.10 to 3.7
- [Release notes](https://github.com/kjd/idna/releases)
- [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst)
- [Commits](https://github.com/kjd/idna/compare/v2.10...v3.7)

---
updated-dependencies:
- dependency-name: idna
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 doc/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/requirements.txt b/doc/requirements.txt
index 1e31021..d03fe5a 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -12,7 +12,7 @@ Babel==2.9.1
 certifi==2024.7.4
 chardet==4.0.0
 colorama==0.4.4
-idna==2.10
+idna==3.7
 imagesize==1.4.1
 Jinja2==3.0.1
 MarkupSafe==2.0.1

From 5402fb0dc47efa206e25b5ac09e90c4511e451a7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 15 May 2025 21:47:33 +0000
Subject: [PATCH 46/52] Bump jinja2 in /doc in the pip group across 1 directory

Bumps the pip group with 1 update in the /doc directory: [jinja2](https://github.com/pallets/jinja).


Updates `jinja2` from 3.0.1 to 3.1.6
- [Release notes](https://github.com/pallets/jinja/releases)
- [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/jinja/compare/3.0.1...3.1.6)

---
updated-dependencies:
- dependency-name: jinja2
  dependency-version: 3.1.6
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 doc/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/requirements.txt b/doc/requirements.txt
index d03fe5a..3bc7ba6 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -14,7 +14,7 @@ chardet==4.0.0
 colorama==0.4.4
 idna==3.7
 imagesize==1.4.1
-Jinja2==3.0.1
+Jinja2==3.1.6
 MarkupSafe==2.0.1
 packaging==23.2
 Pygments==2.17.2

From d5ff028da6c2611f9d104eb5bb35f326aa64739a Mon Sep 17 00:00:00 2001
From: Akhil Pratap Singh <118816443+Akhils777@users.noreply.github.com>
Date: Thu, 3 Jul 2025 02:54:02 +0530
Subject: [PATCH 47/52] Fix: Enable JOPT to support open-system optimization
 with TRACEDIFF fidelity (#49)

---
 src/qutip_qoc/_jopt.py             |  6 ++---
 src/qutip_qoc/result.py            |  2 +-
 tests/test_jopt_open_system_bug.py | 39 ++++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_jopt_open_system_bug.py

diff --git a/src/qutip_qoc/_jopt.py b/src/qutip_qoc/_jopt.py
index 6274076..6c9cfe2 100644
--- a/src/qutip_qoc/_jopt.py
+++ b/src/qutip_qoc/_jopt.py
@@ -150,8 +150,8 @@ def _infid(self, params):
         if self._fid_type == "TRACEDIFF":
             diff = X - self._target
             # to prevent if/else in qobj.dag() and qobj.tr()
-            diff_dag = Qobj(diff.data.adjoint(), dims=diff.dims)
-            g = 1 / 2 * (diff_dag * diff).data.trace()
+            diff_dag = diff.dag() # direct access to JAX array, no fallback!
+            g = 1 / 2 * jnp.trace(diff_dag.data._jxa @ diff.data._jxa)
             infid = jnp.real(self._norm_fac * g)
         else:
             g = self._norm_fac * self._target.overlap(X)
@@ -160,4 +160,4 @@ def _infid(self, params):
             elif self._fid_type == "SU":  # f_SU (incl global phase)
                 infid = 1 - jnp.real(g)
 
-        return infid
+        return infid
\ No newline at end of file
diff --git a/src/qutip_qoc/result.py b/src/qutip_qoc/result.py
index be63877..7738aa9 100644
--- a/src/qutip_qoc/result.py
+++ b/src/qutip_qoc/result.py
@@ -13,7 +13,7 @@
 try:
     import jax
     import jaxlib
-    _jitfun_type = jaxlib.xla_extension.PjitFunction
+    _jitfun_type = type(jax.jit(lambda x: x))
 except ImportError:
     _jitfun_type = None
 
diff --git a/tests/test_jopt_open_system_bug.py b/tests/test_jopt_open_system_bug.py
new file mode 100644
index 0000000..a4c9d26
--- /dev/null
+++ b/tests/test_jopt_open_system_bug.py
@@ -0,0 +1,39 @@
+import numpy as np
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+from jax import jit, numpy
+
+def test_open_system_jopt_runs_without_error():
+    Hd = qt.Qobj(np.diag([1, 2]))
+    c_ops = [np.sqrt(0.1) * qt.sigmam()]
+    Hc = qt.sigmax()
+
+    Ld = qt.liouvillian(H=Hd, c_ops=c_ops)
+    Lc = qt.liouvillian(Hc)
+
+    initial_state = qt.fock_dm(2, 0)
+    target_state = qt.fock_dm(2, 1)
+
+    times = np.linspace(0, 2 * np.pi, 250)
+
+    @jit
+    def sin_x(t, c, **kwargs):
+        return c[0] * numpy.sin(c[1] * t)
+    L = [Ld, [Lc, sin_x]]
+
+    guess_params = [1, 0.5]
+
+    res_jopt = optimize_pulses(
+        objectives = Objective(initial_state, L, target_state),
+        control_parameters = {
+            "ctrl_x": {"guess": guess_params, "bounds": [(-1, 1), (0, 2 * np.pi)]}
+        },
+        tlist = times,
+        algorithm_kwargs = {
+            "alg": "JOPT",
+            "fid_err_targ": 0.001,
+        },
+    )
+
+    assert res_jopt.infidelity < 0.25, f"Fidelity error too high: {res_jopt.infidelity}"
\ No newline at end of file

From ee0381f7bb8a96df7a876617985b2ccd8754ff75 Mon Sep 17 00:00:00 2001
From: Rochisha Agarwal <45593458+rochisha0@users.noreply.github.com>
Date: Fri, 15 Aug 2025 21:45:36 +0900
Subject: [PATCH 48/52] Match grape infidelity with manually computed one (#51)

* fix dim calc susch that grape infid computed correctly
---
 src/qutip_qoc/pulse_optim.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py
index 935bf18..fc44231 100644
--- a/src/qutip_qoc/pulse_optim.py
+++ b/src/qutip_qoc/pulse_optim.py
@@ -195,34 +195,55 @@ def optimize_pulses(
             "maxiter": algorithm_kwargs.get("max_iter", 1000),
             "gtol": algorithm_kwargs.get("min_grad", 0.0 if alg == "CRAB" else 1e-8),
         }
+
     # Iterate over objectives and convert initial and target states based on the optimization type
     for objective in objectives:
         H_list = objective.H if isinstance(objective.H, list) else [objective.H]
         if any(qt.issuper(H_i) for H_i in H_list):
             if isinstance(optimization_type, str) and optimization_type.lower() == "state_transfer":
                 if qt.isket(objective.initial):
+                    dim = objective.initial.shape[0]
                     objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
                 elif qt.isoper(objective.initial):
+                    dim = objective.initial.shape[0]
                     objective.initial = qt.operator_to_vector(objective.initial)
+
                 if qt.isket(objective.target):
                     objective.target = qt.operator_to_vector(qt.ket2dm(objective.target))
                 elif qt.isoper(objective.target):
                     objective.target = qt.operator_to_vector(objective.target)
+
+                algorithm_kwargs.setdefault("fid_params", {})
+                algorithm_kwargs["fid_params"].setdefault("scale_factor", 1.0 / dim)
+
             elif isinstance(optimization_type, str) and optimization_type.lower() == "gate_synthesis":
                 objective.initial = qt.to_super(objective.initial)
                 objective.target = qt.to_super(objective.target)
+
             elif optimization_type is None:
+                is_state_transfer = False
                 if qt.isoper(objective.initial) and qt.isoper(objective.target):
-                    if np.isclose((objective.initial).tr(), 1) and np.isclose((objective.target).tr(), 1):
+                    if np.isclose(objective.initial.tr(), 1) and np.isclose(objective.target.tr(), 1):
+                        dim = objective.initial.shape[0]
                         objective.initial = qt.operator_to_vector(objective.initial)
                         objective.target = qt.operator_to_vector(objective.target)
+                        is_state_transfer = True
                     else:
                         objective.initial = qt.to_super(objective.initial)
                         objective.target = qt.to_super(objective.target)
+
                 if qt.isket(objective.initial):
+                    dim = objective.initial.shape[0]
                     objective.initial = qt.operator_to_vector(qt.ket2dm(objective.initial))
+                    is_state_transfer = True
+
                 if qt.isket(objective.target):
                     objective.target = qt.operator_to_vector(qt.ket2dm(objective.target))
+                    is_state_transfer = True
+
+                if is_state_transfer:
+                    algorithm_kwargs.setdefault("fid_params", {})
+                    algorithm_kwargs["fid_params"].setdefault("scale_factor", 1.0 / dim)
 
     # prepare qtrl optimizers
     qtrl_optimizers = []

From 7fe9ce641123412da30fcf3c61b5e28ef89e2e5d Mon Sep 17 00:00:00 2001
From: SeitherJulian <julian.seither@gmx.de>
Date: Fri, 15 Aug 2025 21:48:11 +0900
Subject: [PATCH 49/52] Add interactive test notebooks for closed systems (#43)

* Added interactive test notebooks for closed systems

* Added interactive tests to automatic test suite

* Make sure required packages are installed in test environment

* Finalized gate synthesis notebooks

* Added about file for interactive notebooks

---------

Co-authored-by: Paul Menczel <paul@menczel.net>
---
 .github/workflows/test.yml              |   2 +-
 tests/interactive/CRAB_gate_closed.md   | 102 ++++++++
 tests/interactive/CRAB_state_closed.md  |  95 ++++++++
 tests/interactive/GOAT_gate_closed.md   | 280 ++++++++++++++++++++++
 tests/interactive/GOAT_state_closed.md  | 292 +++++++++++++++++++++++
 tests/interactive/GRAPE_gate_closed.md  | 124 ++++++++++
 tests/interactive/GRAPE_state_closed.md | 114 +++++++++
 tests/interactive/JOPT_gate_closed.md   | 285 +++++++++++++++++++++++
 tests/interactive/JOPT_state_closed.md  | 296 ++++++++++++++++++++++++
 tests/interactive/about.md              |  21 ++
 tests/test_interactive.py               |  32 +++
 11 files changed, 1642 insertions(+), 1 deletion(-)
 create mode 100644 tests/interactive/CRAB_gate_closed.md
 create mode 100644 tests/interactive/CRAB_state_closed.md
 create mode 100644 tests/interactive/GOAT_gate_closed.md
 create mode 100644 tests/interactive/GOAT_state_closed.md
 create mode 100644 tests/interactive/GRAPE_gate_closed.md
 create mode 100644 tests/interactive/GRAPE_state_closed.md
 create mode 100644 tests/interactive/JOPT_gate_closed.md
 create mode 100644 tests/interactive/JOPT_state_closed.md
 create mode 100644 tests/interactive/about.md
 create mode 100644 tests/test_interactive.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 14185e3..b0272a6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -79,7 +79,7 @@ jobs:
 
       - name: Test with pytest and generate coverage report
         run: |
-          pip install pytest-cov coveralls
+          pip install jupytext nbconvert ipykernel pytest-cov coveralls
           pytest tests --strict-markers --cov=qutip_qoc --cov-report= --color=yes
 
       - name: Upload to Coveralls
diff --git a/tests/interactive/CRAB_gate_closed.md b/tests/interactive/CRAB_gate_closed.md
new file mode 100644
index 0000000..ea36556
--- /dev/null
+++ b/tests/interactive/CRAB_gate_closed.md
@@ -0,0 +1,102 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# CRAB algorithm for a closed system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import gates, qeye, sigmax, sigmay, sigmaz
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+def fidelity(gate, target_gate):
+    """
+    Fidelity used for unitary gates in qutip-qtrl and qutip-qoc
+    """
+    return np.abs(gate.overlap(target_gate) / target_gate.norm())
+```
+
+## Problem setup
+
+```python
+omega = 0.1  # energy splitting
+sx, sy, sz = sigmax(), sigmay(), sigmaz()
+
+Hd = 1 / 2 * omega * sz
+Hc = [sx, sy, sz]
+H = [Hd, Hc[0], Hc[1], Hc[2]]
+
+# objective for optimization
+initial_gate = qeye(2)
+target_gate = gates.hadamard_transform()
+
+times = np.linspace(0, np.pi / 2, 250)
+```
+
+## CRAB algorithm
+
+```python
+n_params = 3  # adjust in steps of 3
+control_params = {
+    "ctrl_x": {"guess": [1 for _ in range(n_params)], "bounds": [(-1, 1)] * n_params},
+    "ctrl_y": {"guess": [1 for _ in range(n_params)], "bounds": [(-1, 1)] * n_params},
+    "ctrl_z": {"guess": [1 for _ in range(n_params)], "bounds": [(-1, 1)] * n_params},
+}
+
+res_crab = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "CRAB",
+        "fid_err_targ": 0.001
+    },
+)
+
+print('Infidelity: ', res_crab.infidelity)
+
+plt.plot(times, res_crab.optimized_controls[0], 'b', label='optimized pulse sx')
+plt.plot(times, res_crab.optimized_controls[1], 'g', label='optimized pulse sy')
+plt.plot(times, res_crab.optimized_controls[2], 'r', label='optimized pulse sz')
+plt.title('CRAB pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc[0], res_crab.optimized_controls[0]], [Hc[1], res_crab.optimized_controls[1]], [Hc[2], res_crab.optimized_controls[2]]]
+evolution = qt.sesolve(H_result, initial_gate, times)
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution.states], label="Overlap with target gate")
+
+plt.title('CRAB performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_crab.infidelity < 0.001
+assert fidelity(evolution.states[-1], target_gate) > 1-0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/CRAB_state_closed.md b/tests/interactive/CRAB_state_closed.md
new file mode 100644
index 0000000..278ab21
--- /dev/null
+++ b/tests/interactive/CRAB_state_closed.md
@@ -0,0 +1,95 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# CRAB algorithm for 2 level system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import basis, Qobj
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+```
+
+## Problem setup
+
+```python
+# Energy levels
+E1, E2 = 1.0, 2.0
+
+Hd = Qobj(np.diag([E1, E2]))
+Hc = Qobj(np.array([
+    [0, 1],
+    [1, 0]
+]))
+H = [Hd, Hc]
+
+initial_state = basis(2, 0)  # |1>
+target_state = basis(2, 1)   # |2>
+
+times = np.linspace(0, 2 * np.pi, 250)
+```
+
+## CRAB algorithm
+
+```python
+n_params = 6 # adjust in steps of 3
+control_params = {
+    "ctrl_x": {"guess": [1 for _ in range(n_params)], "bounds": [(-1, 1)] * n_params},
+}
+
+res_crab = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "CRAB",
+        "fid_err_targ": 0.001
+    },
+)
+
+print('Infidelity: ', res_crab.infidelity)
+
+plt.plot(times, res_crab.optimized_controls[0], label='optimized pulse')
+plt.title('CRAB pulse')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc, np.array(res_crab.optimized_controls[0])]]
+evolution = qt.sesolve(H_result, initial_state, times)
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], '--', label="Fidelity")
+
+plt.title("CRAB performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_crab.infidelity < 0.001
+assert np.abs(evolution.states[-1].overlap(target_state)) > 1-0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/GOAT_gate_closed.md b/tests/interactive/GOAT_gate_closed.md
new file mode 100644
index 0000000..4eafa97
--- /dev/null
+++ b/tests/interactive/GOAT_gate_closed.md
@@ -0,0 +1,280 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# GOAT algorithm for a closed system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import gates, qeye, sigmax, sigmay, sigmaz
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+def fidelity(gate, target_gate):
+    """
+    Fidelity used for unitary gates in qutip-qtrl and qutip-qoc
+    """
+    return np.abs(gate.overlap(target_gate) / target_gate.norm())
+```
+
+## Problem setup
+
+```python
+omega = 0.1  # energy splitting
+sx, sy, sz = sigmax(), sigmay(), sigmaz()
+
+Hd = 1 / 2 * omega * sz
+Hc = [sx, sy, sz]
+
+# objective for optimization
+initial_gate = qeye(2)
+target_gate = gates.hadamard_transform()
+
+times = np.linspace(0, np.pi / 2, 250)
+```
+
+## Guess
+
+```python
+goat_guess = [1, 1]
+guess_pulse = goat_guess[0] * np.sin(goat_guess[1] * times)
+
+H_guess = [Hd] + [[hc, guess_pulse] for hc in Hc]
+evolution_guess = qt.sesolve(H_guess, initial_gate, times)
+
+print('Fidelity: ', fidelity(evolution_guess.states[-1], target_gate))
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution_guess.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution_guess.states], label="Overlap with target gate")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## GOAT algorithm
+
+```python
+# control function
+def sin(t, c):
+    return c[0] * np.sin(c[1] * t)
+
+# derivatives
+def grad_sin(t, c, idx):
+    if idx == 0:  # w.r.t. c0
+        return np.sin(c[1] * t)
+    if idx == 1:  # w.r.t. c1
+        return c[0] * np.cos(c[1] * t) * t
+    if idx == 2:  # w.r.t. time
+        return c[0] * np.cos(c[1] * t) * c[1]
+    
+H = [Hd] + [[hc, sin, {"grad": grad_sin}] for hc in Hc]
+```
+
+### a) not optimized over time
+
+```python
+control_params = {
+    id: {"guess": goat_guess, "bounds": [(-1, 1), (0, 2 * np.pi)]}  # c0 and c1
+    for id in ['x', 'y', 'z']
+}
+
+# run the optimization
+res_goat = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+print('Infidelity: ', res_goat.infidelity)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times, res_goat.optimized_controls[0], 'b', label='optimized pulse sx')
+plt.plot(times, res_goat.optimized_controls[1], 'g', label='optimized pulse sy')
+plt.plot(times, res_goat.optimized_controls[2], 'r', label='optimized pulse sz')
+plt.title('GOAT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd,
+            [Hc[0], np.array(res_goat.optimized_controls[0])],
+            [Hc[1], np.array(res_goat.optimized_controls[1])],
+            [Hc[2], np.array(res_goat.optimized_controls[2])]]
+evolution = qt.sesolve(H_result, initial_gate, times)
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution.states], label="Overlap with target gate")
+
+plt.title('GOAT performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+### b) optimized over time
+
+```python
+# treats time as optimization variable
+control_params["__time__"] = {
+    "guess": times[len(times) // 2],
+    "bounds": [times[0], times[-1]],
+}
+
+# run the optimization
+res_goat_time = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+opt_time = res_goat_time.optimized_params[-1][0]
+time_range = times < opt_time
+
+print('Infidelity: ', res_goat_time.infidelity)
+print('Optimized time: ', opt_time)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times[time_range], np.array(res_goat_time.optimized_controls[0])[time_range], 'b', label='optimized pulse sx')
+plt.plot(times[time_range], np.array(res_goat_time.optimized_controls[1])[time_range], 'g', label='optimized pulse sy')
+plt.plot(times[time_range], np.array(res_goat_time.optimized_controls[2])[time_range], 'r', label='optimized pulse sz')
+plt.title('GOAT pulses (time optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times2 = times[time_range]
+if opt_time not in times2:
+    times2 = np.append(times2, opt_time)
+
+H_result = qt.QobjEvo(
+    [Hd, [Hc[0], np.array(res_goat_time.optimized_controls[0])],
+         [Hc[1], np.array(res_goat_time.optimized_controls[1])], 
+         [Hc[2], np.array(res_goat_time.optimized_controls[2])]], tlist=times)
+evolution_time = qt.sesolve(H_result, initial_gate, times2)
+
+plt.plot(times2, [fidelity(gate, initial_gate) for gate in evolution_time.states], label="Overlap with initial gate")
+plt.plot(times2, [fidelity(gate, target_gate) for gate in evolution_time.states], label="Overlap with target gate")
+
+plt.title('GOAT (optimized over time) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Global optimization
+
+```python
+res_goat_global = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001,
+    },
+    optimizer_kwargs = {
+       "method": "basinhopping",
+       "max_iter": 100,
+    }
+)
+
+global_time = res_goat_global.optimized_params[-1][0]
+global_range = times < global_time
+
+print('Infidelity: ', res_goat_global.infidelity)
+print('Optimized time: ', global_time)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times[global_range], np.array(res_goat_global.optimized_controls[0])[global_range], 'b', label='optimized pulse sx')
+plt.plot(times[global_range], np.array(res_goat_global.optimized_controls[1])[global_range], 'g', label='optimized pulse sy')
+plt.plot(times[global_range], np.array(res_goat_global.optimized_controls[2])[global_range], 'r', label='optimized pulse sz')
+plt.title('GOAT pulses (global optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times3 = times[global_range]
+if global_time not in times3:
+    times3 = np.append(times3, global_time)
+
+H_result = qt.QobjEvo(
+    [Hd, [Hc[0], np.array(res_goat_global.optimized_controls[0])],
+         [Hc[1], np.array(res_goat_global.optimized_controls[1])], 
+         [Hc[2], np.array(res_goat_global.optimized_controls[2])]], tlist=times)
+evolution_global = qt.sesolve(H_result, initial_gate, times3)
+
+plt.plot(times3, [fidelity(gate, initial_gate) for gate in evolution_global.states], label="Overlap with initial gate")
+plt.plot(times3, [fidelity(gate, target_gate) for gate in evolution_global.states], label="Overlap with target gate")
+
+plt.title('GOAT (global optimization) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Comparison
+
+```python
+fig, axes = plt.subplots(1, 3, figsize=(18, 4))  # 1 row, 3 columns
+
+titles = ["GOAT sx pulses", "GOAT sy pulses", "GOAT sz pulses"]
+
+for i, ax in enumerate(axes):
+    ax.plot(times, guess_pulse, label='initial guess')
+    ax.plot(times, res_goat.optimized_controls[i], label='optimized pulse')
+    ax.plot(times[time_range], np.array(res_goat_time.optimized_controls[i])[time_range], label='optimized (over time) pulse')
+    ax.plot(times[global_range], np.array(res_goat_global.optimized_controls[i])[global_range], label='global optimized pulse')
+    ax.set_title(titles[i])
+    ax.set_xlabel('Time')
+    ax.set_ylabel('Pulse amplitude')
+    ax.legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_goat.infidelity < 0.001
+assert fidelity(evolution.states[-1], target_gate) > 1-0.001
+
+assert res_goat_time.infidelity < 0.001
+assert fidelity(evolution_time.states[-1], target_gate) > 1-0.001
+
+assert res_goat_global.infidelity < 0.001
+assert fidelity(evolution_global.states[-1], target_gate) > 1-0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/GOAT_state_closed.md b/tests/interactive/GOAT_state_closed.md
new file mode 100644
index 0000000..bbce5f2
--- /dev/null
+++ b/tests/interactive/GOAT_state_closed.md
@@ -0,0 +1,292 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# GOAT algorithm for a 2 level system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import basis, Qobj
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+```
+
+## Problem setup
+
+```python
+# Energy levels
+E1, E2 = 1.0, 2.0
+
+Hd = Qobj(np.diag([E1, E2]))
+Hc = Qobj(np.array([
+    [0, 1],
+    [1, 0]
+]))
+H = [Hd, Hc]
+
+initial_state = basis(2, 0)  # |1>
+target_state = basis(2, 1)   # |2>
+
+times = np.linspace(0, 2 * np.pi, 250)
+```
+
+## Guess
+
+```python
+goat_guess = [1, 0.5]
+guess_pulse = goat_guess[0] * np.sin(goat_guess[1] * times)
+
+H_guess = [Hd, [Hc, guess_pulse]]
+evolution_guess = qt.sesolve(H_guess, initial_state, times)
+
+print('Fidelity: ', qt.fidelity(evolution_guess.states[-1], target_state))
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution_guess.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution_guess.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution_guess.states], '--', label="Fidelity")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## GOAT algorithm
+
+```python
+# control function
+def sin(t, c):
+    return c[0] * np.sin(c[1] * t)
+
+# gradient
+def grad_sin(t, c, idx):
+    if idx == 0:  # w.r.t. c0
+        return np.sin(c[1] * t)
+    if idx == 1:  # w.r.t. c1
+        return c[0] * np.cos(c[1] * t) * t
+    if idx == 2:  # w.r.t. time
+        return c[0] * np.cos(c[1] * t) * c[1]
+    
+H = [Hd] + [[Hc, sin, {"grad": grad_sin}]]
+```
+
+### a) not optimized over time
+
+```python
+control_params = {
+    "ctrl_x": {"guess": goat_guess, "bounds": [(-1, 1), (0, 2 * np.pi)]}  # c0 and c1
+}
+
+# run the optimization
+res_goat = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001
+    },
+)
+
+print('Infidelity: ', res_goat.infidelity)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_goat.optimized_controls[0], label='optimized pulse')
+plt.title('GOAT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc, np.array(res_goat.optimized_controls[0])]]
+evolution = qt.sesolve(H_result, initial_state, times)
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], '--', label="Fidelity")
+
+plt.title('GOAT performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+Here, GOAT is stuck in a local minimum and does not reach the desired fidelity.
+
+
+### b) optimized over time
+
+```python
+# treats time as optimization variable
+control_params["__time__"] = {
+    "guess": times[len(times) // 2],
+    "bounds": [times[0], times[-1]],
+}
+
+# run the optimization
+res_goat_time = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001
+    },
+)
+
+opt_time = res_goat_time.optimized_params[-1][0]
+time_range = times < opt_time
+
+print('Infidelity: ', res_goat_time.infidelity)
+print('Optimized time: ', opt_time)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_goat.optimized_controls[0], label='optimized pulse')
+plt.plot(times[time_range], np.array(res_goat_time.optimized_controls[0])[time_range], label='optimized (over time) pulse')
+plt.title('GOAT pulses (time optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times2 = times[time_range]
+if opt_time not in times2:
+    times2 = np.append(times2, opt_time)
+
+H_result = qt.QobjEvo([Hd, [Hc, np.array(res_goat_time.optimized_controls[0])]], tlist=times)
+evolution_time = qt.sesolve(H_result, initial_state, times2)
+
+plt.plot(times2, [np.abs(state.overlap(initial_state)) for state in evolution_time.states], label="Overlap with initial state")
+plt.plot(times2, [np.abs(state.overlap(target_state)) for state in evolution_time.states], label="Overlap with target state")
+plt.plot(times2, [qt.fidelity(state, target_state) for state in evolution_time.states], '--', label="Fidelity")
+
+plt.title('GOAT (optimized over time) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+GOAT is still stuck in a local minimum, but the fidelity has improved.
+
+
+### c) global optimization 
+
+```python
+res_goat_global = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GOAT",
+        "fid_err_targ": 0.001
+    },
+    optimizer_kwargs={
+       "method": "basinhopping",
+       "max_iter": 1000
+    }
+)
+
+global_time = res_goat_global.optimized_params[-1][0]
+global_range = times < global_time
+
+print('Infidelity: ', res_goat_global.infidelity)
+print('Optimized time: ', global_time)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_goat.optimized_controls[0], label='optimized pulse')
+plt.plot(times[global_range], np.array(res_goat_global.optimized_controls[0])[global_range], label='global optimized pulse')
+plt.title('GOAT pulses (global)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times3 = times[global_range]
+if global_time not in times3:
+    times3 = np.append(times3, global_time)
+
+H_result = qt.QobjEvo([Hd, [Hc, np.array(res_goat_global.optimized_controls[0])]], tlist=times)
+evolution_global = qt.sesolve(H_result, initial_state, times3)
+
+plt.plot(times3, [np.abs(state.overlap(initial_state)) for state in evolution_global.states], label="Overlap with initial state")
+plt.plot(times3, [np.abs(state.overlap(target_state)) for state in evolution_global.states], label="Overlap with target state")
+plt.plot(times3, [qt.fidelity(state, target_state) for state in evolution_global.states], '--', label="Fidelity")
+
+plt.title('GOAT (global) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Comparison
+
+```python
+plt.plot(times, guess_pulse, color='blue', label='initial guess')
+plt.plot(times, res_goat.optimized_controls[0], color='orange', label='optimized pulse')
+plt.plot(times[time_range], np.array(res_goat_time.optimized_controls[0])[time_range], 
+         color='green', label='optimized (over time) pulse')
+plt.plot(times[global_range], np.array(res_goat_global.optimized_controls[0])[global_range], 
+         color='red', label='global optimized pulse')
+
+plt.title('GOAT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+print('Guess Fidelity: ', qt.fidelity(evolution_guess.states[-1], target_state))
+print('GOAT Fidelity: ', 1 - res_goat.infidelity)
+print('Time Fidelity: ', 1 - res_goat_time.infidelity)
+print('GLobal Fidelity: ', 1 - res_goat_global.infidelity)
+
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution_guess.states], color='blue', label="Guess")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], color='orange', label="GOAT")
+plt.plot(times2, [qt.fidelity(state, target_state) for state in evolution_time.states], 
+         color='green', label="Time")
+plt.plot(times3, [qt.fidelity(state, target_state) for state in evolution_global.states], 
+         color='red', label="Global")
+
+plt.title('Fidelities')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+guess_fidelity = qt.fidelity(evolution_guess.states[-1], target_state)
+
+# target fidelity not reached in part a), check that it is better than the guess
+assert 1 - res_goat.infidelity > guess_fidelity
+assert np.allclose(np.abs(evolution.states[-1].overlap(target_state)), 1 - res_goat.infidelity, atol=1e-3)
+
+# target fidelity not reached in part b), check that it is better than part a)
+assert res_goat_time.infidelity < res_goat.infidelity
+assert np.allclose(np.abs(evolution_time.states[-1].overlap(target_state)), 1 - res_goat_time.infidelity, atol=1e-3)
+
+assert res_goat_global.infidelity < 0.001
+assert np.abs(evolution_global.states[-1].overlap(target_state)) > 1 - 0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/GRAPE_gate_closed.md b/tests/interactive/GRAPE_gate_closed.md
new file mode 100644
index 0000000..bb9b145
--- /dev/null
+++ b/tests/interactive/GRAPE_gate_closed.md
@@ -0,0 +1,124 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# GRAPE algorithm for a closed system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import gates, qeye, sigmax, sigmay, sigmaz
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+def fidelity(gate, target_gate):
+    """
+    Fidelity used for unitary gates in qutip-qtrl and qutip-qoc
+    """
+    return np.abs(gate.overlap(target_gate) / target_gate.norm())
+```
+
+## Problem setup
+
+```python
+omega = 0.1  # energy splitting
+sx, sy, sz = sigmax(), sigmay(), sigmaz()
+
+Hd = 1 / 2 * omega * sz
+Hc = [sx, sy, sz]
+H = [Hd, Hc[0], Hc[1], Hc[2]]
+
+# objective for optimization
+initial_gate = qeye(2)
+target_gate = gates.hadamard_transform()
+
+times = np.linspace(0, np.pi / 2, 250)
+```
+
+## Guess
+
+```python
+guess_pulse_x = np.sin(times)
+guess_pulse_y = np.cos(times)
+guess_pulse_z = np.tanh(times)
+
+H_guess = [Hd, [Hc[0], guess_pulse_x], [Hc[1], guess_pulse_y], [Hc[2], guess_pulse_z]]
+evolution_guess = qt.sesolve(H_guess, initial_gate, times)
+
+print('Fidelity: ', fidelity(evolution_guess.states[-1], target_gate))
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution_guess.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution_guess.states], label="Overlap with target gate")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## GRAPE algorithm
+
+```python
+control_params = {
+    "ctrl_x": {"guess": np.sin(times), "bounds": [-1, 1]},
+    "ctrl_y": {"guess": np.cos(times), "bounds": [-1, 1]},
+    "ctrl_z": {"guess": np.tanh(times), "bounds": [-1, 1]},
+}
+
+res_grape = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GRAPE",
+        "fid_err_targ": 0.001
+    },
+)
+
+print('Infidelity: ', res_grape.infidelity)
+
+plt.plot(times, guess_pulse_x, 'b--', label='guess pulse sx')
+plt.plot(times, res_grape.optimized_controls[0], 'b', label='optimized pulse sx')
+plt.plot(times, guess_pulse_y, 'g--', label='guess pulse sy')
+plt.plot(times, res_grape.optimized_controls[1], 'g', label='optimized pulse sy')
+plt.plot(times, guess_pulse_z, 'r--', label='guess pulse sz')
+plt.plot(times, res_grape.optimized_controls[2], 'r', label='optimized pulse sz')
+plt.title('GRAPE pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc[0], res_grape.optimized_controls[0]], [Hc[1], res_grape.optimized_controls[1]], [Hc[2], res_grape.optimized_controls[2]]]
+evolution = qt.sesolve(H_result, initial_gate, times)
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution.states], label="Overlap with target gate")
+
+plt.title('GRAPE performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_grape.infidelity < 0.001
+assert fidelity(evolution.states[-1], target_gate) > 1-0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/GRAPE_state_closed.md b/tests/interactive/GRAPE_state_closed.md
new file mode 100644
index 0000000..2681df7
--- /dev/null
+++ b/tests/interactive/GRAPE_state_closed.md
@@ -0,0 +1,114 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# GRAPE algorithm for 2 level system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import basis, Qobj
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+```
+
+## Problem setup
+
+```python
+# Energy levels
+E1, E2 = 1.0, 2.0
+
+Hd = Qobj(np.diag([E1, E2]))
+Hc = Qobj(np.array([
+    [0, 1],
+    [1, 0]
+]))
+H = [Hd, Hc]
+
+initial_state = basis(2, 0)  # |1>
+target_state = basis(2, 1)   # |2>
+
+times = np.linspace(0, 2 * np.pi, 250)
+```
+
+## Guess
+
+```python
+guess_pulse = np.sin(times)
+
+H_guess = [Hd, [Hc, guess_pulse]]
+evolution_guess = qt.sesolve(H_guess, initial_state, times)
+
+print('Fidelity: ', qt.fidelity(evolution_guess.states[-1], target_state))
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution_guess.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution_guess.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution_guess.states], '--', label="Fidelity")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## GRAPE algorithm
+
+```python
+control_params = {
+    "ctrl_x": {"guess": guess_pulse, "bounds": [-1, 1]},  # Control pulse for Hc1
+}
+
+res_grape = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "GRAPE",
+        "fid_err_targ": 0.001
+    },
+)
+
+print('Infidelity: ', res_grape.infidelity)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_grape.optimized_controls[0], label='optimized pulse')
+plt.title('GRAPE pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc, np.array(res_grape.optimized_controls[0])]]
+evolution = qt.sesolve(H_result, initial_state, times)
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], '--', label="Fidelity")
+
+plt.title('GRAPE performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_grape.infidelity < 0.01
+assert np.abs(evolution.states[-1].overlap(target_state)) > 1-0.01
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/JOPT_gate_closed.md b/tests/interactive/JOPT_gate_closed.md
new file mode 100644
index 0000000..1f3370c
--- /dev/null
+++ b/tests/interactive/JOPT_gate_closed.md
@@ -0,0 +1,285 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# JOPT algorithm for a closed system
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import gates, qeye, sigmax, sigmay, sigmaz
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+try:
+    from jax import jit
+    from jax import numpy as jnp
+except ImportError:  # JAX not available, skip test
+    import pytest
+    pytest.skip("JAX not available")
+
+def fidelity(gate, target_gate):
+    """
+    Fidelity used for unitary gates in qutip-qtrl and qutip-qoc
+    """
+    return np.abs(gate.overlap(target_gate) / target_gate.norm())
+```
+
+## Problem setup
+
+```python
+omega = 0.1  # energy splitting
+sx, sy, sz = sigmax(), sigmay(), sigmaz()
+
+Hd = 1 / 2 * omega * sz
+Hc = [sx, sy, sz]
+
+# objective for optimization
+initial_gate = qeye(2)
+target_gate = gates.hadamard_transform()
+
+times = np.linspace(0, np.pi / 2, 250)
+```
+
+## Guess
+
+```python
+jopt_guess = [1, 1]
+guess_pulse = jopt_guess[0] * np.sin(jopt_guess[1] * times)
+
+H_guess = [Hd] + [[hc, guess_pulse] for hc in Hc]
+evolution_guess = qt.sesolve(H_guess, initial_gate, times)
+
+print('Fidelity: ', fidelity(evolution_guess.states[-1], target_gate))
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution_guess.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution_guess.states], label="Overlap with target gate")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## JOPT algorithm
+
+```python
+@jit
+def sin_x(t, c, **kwargs):
+    return c[0] * jnp.sin(c[1] * t)
+
+H = [Hd] + [[hc, sin_x] for hc in Hc]
+```
+
+### a) not optimized over time
+
+```python
+control_params = {
+    id: {"guess": jopt_guess, "bounds": [(-1, 1), (0, 2 * np.pi)]}  # c0 and c1
+    for id in ['x', 'y', 'z']
+}
+
+res_jopt = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    minimizer_kwargs = {
+        "method": "Nelder-Mead",
+    },
+    algorithm_kwargs={
+        "alg": "JOPT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+print('Infidelity: ', res_jopt.infidelity)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times, res_jopt.optimized_controls[0], 'b', label='optimized pulse sx')
+plt.plot(times, res_jopt.optimized_controls[1], 'g', label='optimized pulse sy')
+plt.plot(times, res_jopt.optimized_controls[2], 'r', label='optimized pulse sz')
+plt.title('JOPT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd,
+            [Hc[0], np.array(res_jopt.optimized_controls[0])],
+            [Hc[1], np.array(res_jopt.optimized_controls[1])],
+            [Hc[2], np.array(res_jopt.optimized_controls[2])]]
+evolution = qt.sesolve(H_result, initial_gate, times)
+
+plt.plot(times, [fidelity(gate, initial_gate) for gate in evolution.states], label="Overlap with initial gate")
+plt.plot(times, [fidelity(gate, target_gate) for gate in evolution.states], label="Overlap with target gate")
+
+plt.title('JOPT performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+### b) optimized over time
+
+```python
+# treats time as optimization variable
+control_params["__time__"] = {
+    "guess": times[len(times) // 2],
+    "bounds": [times[0], times[-1]],
+}
+
+# run the optimization
+res_jopt_time = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    minimizer_kwargs = {
+        "method": "Nelder-Mead",
+    },
+    algorithm_kwargs={
+        "alg": "JOPT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+opt_time = res_jopt_time.optimized_params[-1][0]
+time_range = times < opt_time
+
+print('Infidelity: ', res_jopt_time.infidelity)
+print('Optimized time: ', opt_time)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times[time_range], np.array(res_jopt_time.optimized_controls[0])[time_range], 'b', label='optimized pulse sx')
+plt.plot(times[time_range], np.array(res_jopt_time.optimized_controls[1])[time_range], 'g', label='optimized pulse sy')
+plt.plot(times[time_range], np.array(res_jopt_time.optimized_controls[2])[time_range], 'r', label='optimized pulse sz')
+plt.title('JOPT pulses (time optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times2 = times[time_range]
+if opt_time not in times2:
+    times2 = np.append(times2, opt_time)
+
+H_result = qt.QobjEvo(
+    [Hd, [Hc[0], np.array(res_jopt_time.optimized_controls[0])],
+         [Hc[1], np.array(res_jopt_time.optimized_controls[1])], 
+         [Hc[2], np.array(res_jopt_time.optimized_controls[2])]], tlist=times)
+evolution_time = qt.sesolve(H_result, initial_gate, times2)
+
+plt.plot(times2, [fidelity(gate, initial_gate) for gate in evolution_time.states], label="Overlap with initial gate")
+plt.plot(times2, [fidelity(gate, target_gate) for gate in evolution_time.states], label="Overlap with target gate")
+
+plt.title('JOPT (optimized over time) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Global optimization
+
+```python
+res_jopt_global = optimize_pulses(
+    objectives = Objective(initial_gate, H, target_gate),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "JOPT",
+        "fid_err_targ": 0.001,
+    },
+    optimizer_kwargs = {
+       "method": "basinhopping",
+       "max_iter": 100,
+    }
+)
+
+global_time = res_jopt_global.optimized_params[-1][0]
+global_range = times < global_time
+
+print('Infidelity: ', res_jopt_global.infidelity)
+print('Optimized time: ', global_time)
+
+plt.plot(times, guess_pulse, 'k--', label='guess pulse sx, sy, sz')
+plt.plot(times[global_range], np.array(res_jopt_global.optimized_controls[0])[global_range], 'b', label='optimized pulse sx')
+plt.plot(times[global_range], np.array(res_jopt_global.optimized_controls[1])[global_range], 'g', label='optimized pulse sy')
+plt.plot(times[global_range], np.array(res_jopt_global.optimized_controls[2])[global_range], 'r', label='optimized pulse sz')
+plt.title('JOPT pulses (global optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times3 = times[global_range]
+if global_time not in times3:
+    times3 = np.append(times3, global_time)
+
+H_result = qt.QobjEvo(
+    [Hd, [Hc[0], np.array(res_jopt_global.optimized_controls[0])],
+         [Hc[1], np.array(res_jopt_global.optimized_controls[1])], 
+         [Hc[2], np.array(res_jopt_global.optimized_controls[2])]], tlist=times)
+evolution_global = qt.sesolve(H_result, initial_gate, times3)
+
+plt.plot(times3, [fidelity(gate, initial_gate) for gate in evolution_global.states], label="Overlap with initial gate")
+plt.plot(times3, [fidelity(gate, target_gate) for gate in evolution_global.states], label="Overlap with target gate")
+
+plt.title('JOPT (global optimization) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Comparison
+
+```python
+fig, axes = plt.subplots(1, 3, figsize=(18, 4))  # 1 row, 3 columns
+
+titles = ["JOPT sx pulses", "JOPT sy pulses", "JOPT sz pulses"]
+
+for i, ax in enumerate(axes):
+    ax.plot(times, guess_pulse, label='initial guess')
+    ax.plot(times, res_jopt.optimized_controls[i], label='optimized pulse')
+    ax.plot(times[time_range], np.array(res_jopt_time.optimized_controls[i])[time_range], label='optimized (over time) pulse')
+    ax.plot(times[global_range], np.array(res_jopt_global.optimized_controls[i])[global_range], label='global optimized pulse')
+    ax.set_title(titles[i])
+    ax.set_xlabel('time')
+    ax.set_ylabel('Pulse amplitude')
+    ax.legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+## Validation
+
+```python
+assert res_jopt.infidelity < 0.001
+assert fidelity(evolution.states[-1], target_gate) > 1-0.001
+
+assert res_jopt_time.infidelity < 0.001
+assert fidelity(evolution_time.states[-1], target_gate) > 1-0.001
+
+assert res_jopt_global.infidelity < 0.001
+assert fidelity(evolution_global.states[-1], target_gate) > 1-0.001
+```
+
+```python
+qt.about()
+```
+
+
diff --git a/tests/interactive/JOPT_state_closed.md b/tests/interactive/JOPT_state_closed.md
new file mode 100644
index 0000000..11547cb
--- /dev/null
+++ b/tests/interactive/JOPT_state_closed.md
@@ -0,0 +1,296 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+      jupytext_version: 1.17.1
+  kernelspec:
+    display_name: Python 3 (ipykernel)
+    language: python
+    name: python3
+---
+
+# JOPT algorithm for a 2 level system
+
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from qutip import basis, Qobj
+import qutip as qt
+from qutip_qoc import Objective, optimize_pulses
+
+try:
+    from jax import jit
+    from jax import numpy as jnp
+except ImportError:  # JAX not available, skip test
+    import pytest
+    pytest.skip("JAX not available")
+```
+
+## Problem setup
+
+```python
+# Energy levels
+E1, E2 = 1.0, 2.0
+
+Hd = Qobj(np.diag([E1, E2]))
+Hc = Qobj(np.array([
+    [0, 1],
+    [1, 0]
+]))
+H = [Hd, Hc]
+
+initial_state = basis(2, 0)  # |1>
+target_state = basis(2, 1)   # |2>
+
+times = np.linspace(0, 2 * np.pi, 250)
+```
+
+## Guess
+
+```python
+jopt_guess = [1, 0.5]
+guess_pulse = jopt_guess[0] * np.sin(jopt_guess[1] * times)
+
+H_guess = [Hd, [Hc, guess_pulse]]
+evolution_guess = qt.sesolve(H_guess, initial_state, times)
+
+print('Fidelity: ', qt.fidelity(evolution_guess.states[-1], target_state))
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution_guess.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution_guess.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution_guess.states], '--', label="Fidelity")
+plt.title("Guess performance")
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## JOPT algorithm
+
+```python
+@jit
+def sin(t, c, **kwargs):
+    return c[0] * jnp.sin(c[1] * t)
+
+H = [Hd] + [[Hc, sin]]
+```
+
+### a) not optimized over time
+
+```python
+control_params = {
+    "ctrl_x": {"guess": [1, 0], "bounds": [(-1, 1), (0, 2 * np.pi)]}  # c0 and c1
+}
+
+res_jopt = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    minimizer_kwargs = {
+        "method": "Nelder-Mead",
+    },
+    algorithm_kwargs = {
+        "alg": "JOPT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+print('Infidelity: ', res_jopt.infidelity)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_jopt.optimized_controls[0], label='optimized pulse')
+plt.title('JOPT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+H_result = [Hd, [Hc, np.array(res_jopt.optimized_controls[0])]]
+evolution = qt.sesolve(H_result, initial_state, times)
+
+plt.plot(times, [np.abs(state.overlap(initial_state)) for state in evolution.states], label="Overlap with initial state")
+plt.plot(times, [np.abs(state.overlap(target_state)) for state in evolution.states], label="Overlap with target state")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], '--', label="Fidelity")
+
+plt.title('JOPT performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+Here, JOPT is stuck in a local minimum and does not reach the desired fidelity.
+
+
+### b) optimized over time
+
+```python
+# treats time as optimization variable
+control_params["__time__"] = {
+    "guess": times[len(times) // 2],
+    "bounds": [times[0], times[-1]],
+}
+
+# run the optimization
+res_jopt_time = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    minimizer_kwargs = {
+        "method": "Nelder-Mead",
+    },
+    algorithm_kwargs = {
+        "alg": "JOPT",
+        "fid_err_targ": 0.001,
+    },
+)
+
+opt_time = res_jopt_time.optimized_params[-1][0]
+time_range = times < opt_time
+
+print('Infidelity: ', res_jopt_time.infidelity)
+print('Optimized time: ', opt_time)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_jopt.optimized_controls[0], label='optimized pulse')
+plt.plot(times[time_range], np.array(res_jopt_time.optimized_controls[0])[time_range], label='optimized (over time) pulse')
+plt.title('JOPT pulses (time optimization)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times2 = times[time_range]
+if opt_time not in times2:
+    times2 = np.append(times2, opt_time)
+
+H_result = qt.QobjEvo([Hd, [Hc, np.array(res_jopt_time.optimized_controls[0])]], tlist=times)
+evolution_time = qt.sesolve(H_result, initial_state, times2)
+
+plt.plot(times2, [np.abs(state.overlap(initial_state)) for state in evolution_time.states], label="Overlap with initial state")
+plt.plot(times2, [np.abs(state.overlap(target_state)) for state in evolution_time.states], label="Overlap with target state")
+plt.plot(times2, [qt.fidelity(state, target_state) for state in evolution_time.states], '--', label="Fidelity")
+
+plt.title('JOPT (optimized over time) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+JOPT is still stuck in a local minimum, but the fidelity has improved.
+
+
+### c) global optimization 
+
+```python
+res_jopt_global = optimize_pulses(
+    objectives = Objective(initial_state, H, target_state),
+    control_parameters = control_params,
+    tlist = times,
+    algorithm_kwargs = {
+        "alg": "JOPT",
+        "fid_err_targ": 0.001
+    },
+    optimizer_kwargs={
+       "method": "basinhopping",
+       "max_iter": 1000
+    }
+)
+
+global_time = res_jopt_global.optimized_params[-1][0]
+global_range = times < global_time
+
+print('Infidelity: ', res_jopt_global.infidelity)
+print('Optimized time: ', global_time)
+
+plt.plot(times, guess_pulse, label='initial guess')
+plt.plot(times, res_jopt.optimized_controls[0], label='optimized pulse')
+plt.plot(times[global_range], np.array(res_jopt_global.optimized_controls[0])[global_range], label='global optimized pulse')
+plt.title('JOPT pulses (global)')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+times3 = times[global_range]
+if global_time not in times3:
+    times3 = np.append(times3, global_time)
+
+H_result = qt.QobjEvo([Hd, [Hc, np.array(res_jopt_global.optimized_controls[0])]], tlist=times)
+evolution_global = qt.sesolve(H_result, initial_state, times3)
+
+plt.plot(times3, [np.abs(state.overlap(initial_state)) for state in evolution_global.states], label="Overlap with initial state")
+plt.plot(times3, [np.abs(state.overlap(target_state)) for state in evolution_global.states], label="Overlap with target state")
+plt.plot(times3, [qt.fidelity(state, target_state) for state in evolution_global.states], '--', label="Fidelity")
+
+plt.title('JOPT (global) performance')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Comparison
+
+```python
+plt.plot(times, guess_pulse, color='blue', label='initial guess')
+plt.plot(times, res_jopt.optimized_controls[0], color='orange', label='optimized pulse')
+plt.plot(times[time_range], np.array(res_jopt_time.optimized_controls[0])[time_range], 
+         color='green', label='optimized (over time) pulse')
+plt.plot(times[global_range], np.array(res_jopt_global.optimized_controls[0])[global_range], 
+         color='red', label='global optimized pulse')
+
+plt.title('JOPT pulses')
+plt.xlabel('Time')
+plt.ylabel('Pulse amplitude')
+plt.legend()
+plt.show()
+```
+
+```python
+print('Guess Fidelity: ', qt.fidelity(evolution_guess.states[-1], target_state))
+print('JOPT Fidelity: ', 1 - res_jopt.infidelity)
+print('Time Fidelity: ', 1 - res_jopt_time.infidelity)
+print('GLobal Fidelity: ', 1 - res_jopt_global.infidelity)
+
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution_guess.states], color='blue', label="Guess")
+plt.plot(times, [qt.fidelity(state, target_state) for state in evolution.states], color='orange', label="Goat")
+plt.plot(times2, [qt.fidelity(state, target_state) for state in evolution_time.states], 
+         color='green', label="Time")
+plt.plot(times3, [qt.fidelity(state, target_state) for state in evolution_global.states], 
+         color='red', label="Global")
+
+plt.title('Fidelities')
+plt.xlabel('Time')
+plt.legend()
+plt.show()
+```
+
+## Validation
+
+```python
+guess_fidelity = qt.fidelity(evolution_guess.states[-1], target_state)
+
+# target fidelity not reached in part a), check that it is better than the guess
+assert 1 - res_jopt.infidelity > guess_fidelity
+assert np.allclose(np.abs(evolution.states[-1].overlap(target_state)), 1 - res_jopt.infidelity, atol=1e-3)
+
+# target fidelity not reached in part b), check that it is better than part a)
+assert res_jopt_time.infidelity < res_jopt.infidelity
+assert np.allclose(np.abs(evolution_time.states[-1].overlap(target_state)), 1 - res_jopt_time.infidelity, atol=1e-3)
+
+assert res_jopt_global.infidelity < 0.001
+assert np.abs(evolution_global.states[-1].overlap(target_state)) > 1 - 0.001
+```
+
+```python
+qt.about()
+```
diff --git a/tests/interactive/about.md b/tests/interactive/about.md
new file mode 100644
index 0000000..9d22894
--- /dev/null
+++ b/tests/interactive/about.md
@@ -0,0 +1,21 @@
+This directory contains notebooks showcasing the use of all algorithms contained within `QuTiP-QOC`.
+The examples are chosen as simple as possible, for the purpose of demonstrating how to use `QuTiP-QOC` in all of these scenarios, and for the purpose of automatically testing `QuTiP-QOC`'s basic functionality in all of these scenarios.
+The included algorithms are:
+- GRAPE
+- CRAB
+- GOAT
+- JOPT
+
+For each algorithm, we have:
+- a closed-system state transfer example
+- an open-system state transfer example
+- a closed-system gate synthesis example
+- an open-system gate synthesis example
+
+The notebooks are included automatically in runs of the test suite (see `test_interactive.py`).
+
+To view and run the notebooks manually, the `jupytext` package is required.
+The notebooks can then either be opened from within Jupyter Notebook using "Open With" -> "Jupytext Notebook", or by converting them first to the `ipynb` format using
+```
+jupytext --to ipynb [notebook name].nb
+```
\ No newline at end of file
diff --git a/tests/test_interactive.py b/tests/test_interactive.py
new file mode 100644
index 0000000..0961db0
--- /dev/null
+++ b/tests/test_interactive.py
@@ -0,0 +1,32 @@
+"""
+This file contains the test suite for running the interactive test notebooks
+in the 'tests/interactive' directory.
+Taken, modified from https://github.com/SeldonIO/alibi/blob/master/testing/test_notebooks.py
+"""
+
+import glob
+import nbclient.exceptions
+import pytest
+
+from pathlib import Path
+from jupytext.cli import jupytext
+import nbclient
+
+# Set of all example notebooks
+NOTEBOOK_DIR = 'tests/interactive'
+ALL_NOTEBOOKS = {
+    Path(x).name
+    for x in glob.glob(str(Path(NOTEBOOK_DIR).joinpath('*.md')))
+    if Path(x).name != 'about.md'
+}
+
+@pytest.mark.parametrize("notebook", ALL_NOTEBOOKS)
+def test_notebook(notebook):
+    notebook = Path(NOTEBOOK_DIR, notebook)
+    try:
+        jupytext(args=[str(notebook), "--execute"])
+    except nbclient.exceptions.CellExecutionError as e:
+        if e.ename == "Skipped":
+            pytest.skip(e.evalue)
+        else:
+            raise e
\ No newline at end of file

From bc385c2744207c296235b79c8c00c9ad46e0403b Mon Sep 17 00:00:00 2001
From: Paul Menczel <paul@menczel.net>
Date: Thu, 21 Aug 2025 09:48:13 +0900
Subject: [PATCH 50/52] Added requirements for interactive tests to setup.cfg

---
 .github/workflows/test.yml | 2 +-
 setup.cfg                  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b0272a6..14185e3 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -79,7 +79,7 @@ jobs:
 
       - name: Test with pytest and generate coverage report
         run: |
-          pip install jupytext nbconvert ipykernel pytest-cov coveralls
+          pip install pytest-cov coveralls
           pytest tests --strict-markers --cov=qutip_qoc --cov-report= --color=yes
 
       - name: Upload to Coveralls
diff --git a/setup.cfg b/setup.cfg
index 2547088..e21ca4d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -49,6 +49,9 @@ rl =
     stable-baselines3>=2.3.2
 tests =
     pytest>=6.0
+    jupytext
+    nbconvert
+    ipykernel
 full =
     %(jopt)s
     %(rl)s

From 896fcc8d9e0489839f78dcc7c552b8892e01b1d1 Mon Sep 17 00:00:00 2001
From: Veronika Kurth <veronika.kurth@posteo.de>
Date: Thu, 5 Mar 2026 10:34:38 +0900
Subject: [PATCH 51/52] Add file with family package info

---
 src/qutip_qoc/family.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 src/qutip_qoc/family.py

diff --git a/src/qutip_qoc/family.py b/src/qutip_qoc/family.py
new file mode 100644
index 0000000..d76645a
--- /dev/null
+++ b/src/qutip_qoc/family.py
@@ -0,0 +1,8 @@
+"""QuTiP family package entry point."""
+
+from . import __version__
+
+
+def version():
+    """Return information to include in qutip.about()."""
+    return "qutip-qoc", __version__

From ca2bc1b7764bfe476e8a639d7c92e83659018fd9 Mon Sep 17 00:00:00 2001
From: Veronika Kurth <veronika.kurth@posteo.de>
Date: Fri, 6 Mar 2026 11:23:04 +0900
Subject: [PATCH 52/52] Add entry points for family packages to setup.cfg

---
 setup.cfg | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index e21ca4d..04fe89b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,6 +39,10 @@ setup_requires =
 [options.packages.find]
 where = src
 
+[options.entry_points]
+qutip.family =
+    qutip_qoc = qutip_qoc.family
+
 [options.extras_require]
 jopt =
     jax