A commented and [documented](https://github.com/werner-duvaud/muzero-general/wiki/MuZero-Documentation) implementation of MuZero based on the Google DeepMind [paper](https://arxiv.org/abs/1911.08265) (Schrittwieser et al., Nov 2019) and the associated [pseudocode](https://arxiv.org/src/1911.08265v2/anc/pseudocode.py).
It is designed to be easily adaptable to any game or reinforcement learning environment (like [gym](https://github.com/openai/gym)). You only need to add a [game file](https://github.com/werner-duvaud/muzero-general/tree/master/games) with the hyperparameters and the game class. Please refer to the [documentation](https://github.com/werner-duvaud/muzero-general/wiki/MuZero-Documentation) and the [example](https://github.com/werner-duvaud/muzero-general/blob/master/games/cartpole.py).
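For intuition, here is a minimal sketch of the shape such a game class can take. It is modeled loosely on the cartpole example; the method names (`reset`, `step`, `legal_actions`) and the toy "reach 3" game itself are illustrative assumptions, not the project's exact interface.

```python
# Sketch of a custom game class, loosely modeled on games/cartpole.py.
# The method names are assumptions taken from that example, not an
# authoritative interface; the game itself is a made-up toy.
class Game:
    """Toy 'reach 3' game: action 1 increments a counter, action 0 resets it.
    The episode ends (with reward 1) once the counter reaches 3."""

    def __init__(self, seed=None):
        self.counter = 0

    def legal_actions(self):
        return [0, 1]

    def reset(self):
        self.counter = 0
        return [self.counter]  # observation

    def step(self, action):
        self.counter = self.counter + 1 if action == 1 else 0
        done = self.counter >= 3
        reward = 1 if done else 0
        return [self.counter], reward, done


game = Game()
obs = game.reset()
for action in [1, 1, 1]:
    obs, reward, done = game.step(action)
```

A real game file would pair a class like this with a `MuZeroConfig` holding the hyperparameters for that environment.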
This implementation is primarily for educational purposes.\
[Explanatory video of MuZero](https://youtu.be/We20YSAJZSE)
MuZero is a state-of-the-art RL algorithm for board games (Chess, Go, ...) and Atari games.
It is the successor to [AlphaZero](https://arxiv.org/abs/1712.01815), but without any knowledge of the environment's underlying dynamics. MuZero learns a model of the environment and uses an internal representation that contains only the information useful for predicting the reward, value, policy and transitions. MuZero is also close to [Value prediction networks](https://arxiv.org/abs/1707.03497). See [How it works](https://github.com/werner-duvaud/muzero-general/wiki/How-MuZero-works).
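MuZero's learned model decomposes into three functions: a representation function (observation to hidden state), a dynamics function (hidden state and action to next hidden state and reward), and a prediction function (hidden state to policy and value). The toy sketch below uses plain `numpy` linear maps as stand-ins (in MuZero each is a neural network; the shapes and names here are illustrative assumptions):

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy stand-ins for MuZero's three functions (the real ones are neural nets):
#   h: representation  observation -> hidden state
#   g: dynamics        (hidden state, action) -> (next hidden state, reward)
#   f: prediction      hidden state -> (policy logits, value)
HIDDEN, N_ACTIONS, OBS_DIM = 4, 2, 3

W_h = rng.normal(size=(OBS_DIM, HIDDEN))
W_g = rng.normal(size=(HIDDEN + N_ACTIONS, HIDDEN))
W_r = rng.normal(size=(HIDDEN,))
W_p = rng.normal(size=(HIDDEN, N_ACTIONS))
W_v = rng.normal(size=(HIDDEN,))

def h(observation):
    return np.tanh(observation @ W_h)

def g(state, action):
    x = np.concatenate([state, np.eye(N_ACTIONS)[action]])
    next_state = np.tanh(x @ W_g)
    return next_state, float(next_state @ W_r)

def f(state):
    return state @ W_p, float(state @ W_v)

# Planning happens entirely in latent space: after the initial h(), the
# environment is never queried again while rolling a trajectory forward.
s = h(np.array([0.1, -0.2, 0.3]))
for action in [0, 1, 0]:
    policy_logits, value = f(s)  # guides the MCTS search at this node
    s, reward = g(s, action)     # imagined transition, no env.step()
```

The search tree built by MCTS expands nodes with `g` and evaluates them with `f`, exactly the loop sketched above.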
* [ ] Windows support (Experimental / Workaround: Use the [notebook](https://github.com/werner-duvaud/muzero-general/blob/master/notebook.ipynb) in [Google Colab](https://colab.research.google.com))
### Further improvements
Here is a list of features that could be interesting to add but are not in MuZero's paper. We are open to contributions and other ideas.
* [x] [Tool to understand the learned model](https://github.com/werner-duvaud/muzero-general/blob/master/diagnose_model.py)
* [ ] Support of stochastic environments
* [ ] Batch MCTS
* [ ] Support for games with more than two players
* [ ] RL tricks (Never Give Up, Adaptive Exploration, ...)
## Demo
You can adapt the configuration of each game by editing the `MuZeroConfig` class of the respective file in the [games folder](https://github.com/werner-duvaud/muzero-general/tree/master/games).
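For example, a quick local experiment might shrink a couple of hyperparameters. The stand-in class below mirrors a few attribute names from `games/cartpole.py`; treat the exact set of attributes as an assumption, since each real config carries many more:

```python
# Hypothetical, stripped-down stand-in for a game's MuZeroConfig class;
# the real classes in the games folder define many more attributes.
class MuZeroConfig:
    def __init__(self):
        self.seed = 0                      # Seed for numpy, torch and the game
        self.batch_size = 1024             # Number of parts of games per training step
        self.training_steps = int(1000e3)  # Total number of weight updates

# Editing the class in the game file (or the instance, as here)
# tunes a run, e.g. scaling it down for a quick smoke test:
config = MuZeroConfig()
config.batch_size = 128
config.training_steps = 10_000
```

Since the attributes are plain Python values, any experiment script can override them before launching training.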
## Related work
* [EfficientZero](https://arxiv.org/abs/2111.00210) (Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, Yang Gao)
* [Sampled MuZero](https://arxiv.org/abs/2104.06303) (Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Mohammadamin Barekatain, Simon Schmitt, David Silver)
**`games/atari.py`** (8 additions, 8 deletions)

```diff
@@ -1,5 +1,5 @@
 import datetime
-import os
+import pathlib

 import gym
 import numpy
@@ -15,6 +15,7 @@
 class MuZeroConfig:
     def __init__(self):
+        # fmt: off
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization

         self.seed = 0  # Seed for numpy, torch and the game
@@ -78,7 +79,7 @@ def __init__(self):
         ### Training
-        self.results_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S"))  # Path to store the model weights and TensorBoard logs
+        self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")  # Path to store the model weights and TensorBoard logs
         self.save_model = True  # Save the checkpoint in results_path as model.checkpoint
         self.training_steps = int(1000e3)  # Total number of training steps (ie weights update according to a batch)
         self.batch_size = 1024  # Number of parts of games to train on at each training step
@@ -114,7 +115,7 @@ def __init__(self):
         self.self_play_delay = 0  # Number of seconds to wait after each played game
         self.training_delay = 0  # Number of seconds to wait after each training step
         self.ratio = None  # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it
```
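The `results_path` change above swaps `os.path` string joining for `pathlib`. Once the old result is normalized (its `"../results"` component collapsed), the two spellings produce the same path. The sketch below checks that on a hypothetical file location; `/repo/games/atari.py` and the timestamp are made up for illustration:

```python
import os
import pathlib

# Hypothetical location of a game file, standing in for games/atari.py.
game_file = "/repo/games/atari.py"
stamp = "2024-01-01--12-00-00"

# Old style: os.path joining with a "../" component.
old = os.path.join(os.path.dirname(game_file), "../results",
                   os.path.basename(game_file)[:-3], stamp)

# New style: pathlib, using parents[1] instead of "../" and .stem
# instead of slicing off the ".py" suffix.
new = (pathlib.Path(game_file).resolve().parents[1]
       / "results" / pathlib.Path(game_file).stem / stamp)

# Both name the same directory once the old form is normalized.
assert os.path.normpath(old) == str(new)
```

Besides avoiding the `[:-3]` suffix slicing, the `pathlib` form yields a `Path` object that downstream code can use directly (e.g. `results_path / "model.checkpoint"`).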