
Commit 527155d

Baselines for the Halite Competition (#706)
* add baselines for the halite competition
* modify the code format
* delete useless files
* delete __init__.py files
* modify the code style and delete useless files
* modify code style
* modify readme.md
* reduce the file size of animation.gif
* recover the size of animation.gif in Halite-Competition/torch
* modify the config.py and encode_model.py
* modify the backend of parl to be torch and the annotation
* modify the annotation and delete useless codes
* modify the code style by yapf
* modify the kaggle_environments to zerosum_env
* reduce the size of test ipython notebook
* fix some typos
* fix some typos
* delete assets and change the address of gifs and imgs to that of parl-experiments
* update requirement.txt and use parl.utils.logger for logging
* better code style in requirement.txt
1 parent 56cf3f5 commit 527155d

34 files changed: +6069 −0 lines
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# Reinforcement Learning Baseline for the Halite Competition

Based on the PARL framework, we provide a baseline solution using the PPO algorithm.


## Contents

* config.py : hyperparameter configuration
* train.py : training script
* test.py : evaluation script
* submission.py : submission example
* rl_trainer
    * model.py : defines the actor and critic network architectures
    * agent.py : handles the interaction between the algorithm and the environment, including feeding data to the algorithm for training
    * algorithm.py : PPO algorithm implementation
    * controller.py : tracks the state of each ship, shapes the reward, and collects training data
    * policy.py : defines the rule-based policy used to control the shipyard
    * obs_parser.py : builds the observation of each ship


## Baseline Design

We use the PPO algorithm to control each ship, with all ships sharing the same model parameters.

Each ship aims to collect K units of halite as quickly as possible (K is a hyperparameter). Once a ship has finished collecting, it returns to the shipyard, deposits the halite, and then starts a new round of collection. In addition, when the episode is about to end (i.e., the maximum number of interaction steps is reached), the ship is also forced to return. In other words, the collection phase of a ship is controlled by the model, while the remaining phases are controlled by rules. The shipyard is controlled entirely by rules, and its goal is to spawn M ships as quickly as possible (M is a hyperparameter).
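Below is a minimal, self-contained sketch of this per-ship control flow. The function and constant names (`ship_phase`, `NUM_HALITE`, `MAX_EPISODE_STEPS`) and the returned labels are illustrative assumptions; the actual logic lives in rl_trainer/controller.py.

```python
# Illustrative sketch only -- names and the episode length are assumptions;
# see rl_trainer/controller.py for the real implementation.

NUM_HALITE = 100          # K: halite a ship should collect before returning
MAX_EPISODE_STEPS = 400   # assumed maximum number of interaction steps

def ship_phase(ship_halite, step, steps_needed_to_return):
    """Decide whether a ship is currently rule-controlled or model-controlled."""
    # Forced return: the episode is about to end.
    if MAX_EPISODE_STEPS - step <= steps_needed_to_return:
        return "return_by_rule"
    # The ship has collected K units of halite: return and deposit it.
    if ship_halite >= NUM_HALITE:
        return "return_by_rule"
    # Otherwise the PPO model picks one of the five ship actions.
    return "collect_by_model"

if __name__ == "__main__":
    print(ship_phase(ship_halite=20, step=10, steps_needed_to_return=8))   # collect_by_model
    print(ship_phase(ship_halite=120, step=10, steps_needed_to_return=8))  # return_by_rule
```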
## Quick Start

Create and activate a virtual Python environment

```shell
conda create -n halite python==3.6

source activate halite
```

Install the dependencies

```shell
pip install -r requirements.txt
```

## Training

After modifying the hyperparameters in config.py, run the following command:

```shell
python train.py
```

## Testing

After training finishes, set the path of your saved model in test.py and run the script to evaluate your model.

```shell
python test.py
```

Note that this test script uses a built-in random agent as the opponent. If you want to compare against other agents, replace "random" with the corresponding agent function. You can also use this script to check that your code runs without errors before submitting your model and solution to the platform.


## Results

The figure below shows the learning curve of the PPO baseline. Currently, we only train the model under a single fixed seed, with a random agent as the opponent. To achieve a good ranking in the competition, participants should train a more robust model (e.g., one that can handle environments with different halite distributions as well as both 1 vs 1 and 1 vs 3 scenarios).

![learning curve](https://github.com/benchmarking-rl/PARL-experiments/blob/master/Baselines/Halite_Competition/paddle/learning_curve.jpg?raw=true)

## Visualization

To watch a rendered match, start a Jupyter Notebook environment, open test.ipynb, and run the cells to see the animation.

![animation](https://github.com/benchmarking-rl/PARL-experiments/blob/master/Baselines/Halite_Competition/paddle/animation.gif?raw=true)

## Submission

Currently, participants can only submit a single file to the platform, so all required functions and the model must be placed in that one file. To load a model inside the file, first encode the model into a byte string and embed it in the file, then decode the byte string wherever the model needs to be loaded. See encode_model.py for how to encode a model, and submission.py for a submission example that loads the model.

Note that the scoring system only calls the last function defined in the submission file. Therefore, the function that produces your agent's actions must be placed at the end of the file; it takes observation and configuration as inputs and returns the actions of every ship and shipyard. See submission.py for details.
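As a rough, hypothetical skeleton of such a file (submission.py is the authoritative example; `MODEL_BYTES`, `decode_actor`, and the body of `agent` below are placeholders):

```python
import base64
import pickle

# Paste the base64 string produced by encode_model.py here (placeholder).
MODEL_BYTES = b"..."

def decode_actor(model_bytes):
    """Decode the base64/pickle byte string back into a dict of numpy weights."""
    return pickle.loads(base64.b64decode(model_bytes))

# ... network definition, observation parsing, rule-based shipyard policy ...

def agent(observation, configuration):
    """Must be the last function in the file: this is what the scoring system calls.

    It maps the current observation to an action for every ship and shipyard,
    e.g. {"ship_id": "NORTH", "shipyard_id": "SPAWN"}.
    """
    actions = {}
    # 1. parse the observation into per-ship features (cf. obs_parser.py)
    # 2. run the decoded actor network to choose each ship's move
    # 3. apply the rule-based policy for the shipyard (cf. policy.py)
    return actions
```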
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

config = {

    # configuration for the environment
    "board_size": 21,

    # configuration for training
    "episodes": 100000,
    "batch_size": 128,
    "train_times": 2,
    "gamma": 0.997,
    "lr": 0.0001,
    "test_every_episode": 100,

    # configuration for the PPO algorithm
    "vf_loss_coef": 1,
    "ent_coef": 0.01,

    # configuration for the observation of ships
    "world_dim": 5 * 21 * 21,
    "ship_obs_dim": 6,
    "ship_act_dim": 5,
    "ship_max_step": 10000,

    # the amount of halite we want each ship to collect (i.e. K)
    "num_halite": 100,

    # the maximum number of ships (i.e. M)
    "num_ships": 10,

    # seed for training
    "seed": 5609,

    # configuration for logging
    "log_path": './train_log/',
    "save_path": './save_model/',
}
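For reference, a minimal sketch of how this dict might be consumed; it assumes the training and test scripts simply import the module-level dict (check train.py for the actual usage):

```python
# Minimal usage sketch; assumes the scripts import the dict directly.
from config import config

print(config["lr"])           # 0.0001, learning rate for the PPO optimizer
print(config["num_halite"])   # K: target halite per collection trip
print(config["num_ships"])    # M: maximum number of ships to spawn
```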
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import pickle
import paddle

if __name__ == '__main__':

    # load the trained model and keep only the actor's state dict
    model = paddle.load('./model/latest_ship_model.pth')
    actor = model['actor']

    # convert the paddle tensors to numpy arrays so they can be pickled
    for name, param in actor.items():
        actor[name] = param.numpy()

    # pickle the weights, base64-encode them, and write the byte string to a file
    model_byte = base64.b64encode(pickle.dumps(actor))
    with open('./model/actor.txt', 'wb') as f:
        f.write(model_byte)
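The decoding counterpart, used inside the single-file submission, would reverse these steps. A minimal sketch, assuming the weights are converted back to paddle tensors before being loaded into the actor network (the actor instance itself is hypothetical here):

```python
# Minimal decoding sketch: the reverse of encode_model.py.
import base64
import pickle

import paddle

with open('./model/actor.txt', 'rb') as f:
    model_byte = f.read()

# base64 -> pickle -> dict of numpy arrays
actor_numpy = pickle.loads(base64.b64decode(model_byte))

# convert the arrays back to paddle tensors so they can form a state dict
actor_state_dict = {name: paddle.to_tensor(param) for name, param in actor_numpy.items()}

# actor_network.set_state_dict(actor_state_dict)  # hypothetical actor instance (cf. model.py)
```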
Binary file not shown.
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
parl>=2.0.0
paddlepaddle>=2.0.0
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import parl
import paddle


class Agent(parl.Agent):
    """Agent.

    Args:
        algorithm (`parl.Algorithm`): algorithm to be used in this agent.
    """

    def __init__(self, algorithm):
        self.alg = algorithm

    def learn(self, obs, act, value, returns, log_prob, adv):
        """Update the actor and critic networks.

        Args:
            obs (np.array): representation of current observations
            act (np.array): current actions
            value (np.array): state values
            returns (np.array): discounted returns
            log_prob (np.array): log probabilities of the actions
            adv (np.array): advantage values
        """
        obs = paddle.to_tensor(obs, dtype=paddle.float32)
        act = paddle.to_tensor(act, dtype=paddle.int32)
        value = paddle.to_tensor(value, dtype=paddle.float32)
        returns = paddle.to_tensor(returns, dtype=paddle.float32)
        log_prob = paddle.to_tensor(log_prob, dtype=paddle.float32)
        adv = paddle.to_tensor(adv, dtype=paddle.float32)

        value_loss, action_loss, entropy = self.alg.learn(
            obs, act, value, returns, log_prob, adv)

        return value_loss, action_loss, entropy

    def predict(self, state):
        """Predict the action.

        Args:
            state (np.array): representation of current state

        Return:
            action (np.array): action to be executed
        """
        state_tensor = paddle.to_tensor(state, dtype=paddle.float32)

        with paddle.no_grad():
            action = self.alg.predict(state_tensor).cpu().numpy()

        return action

    def sample(self, state):
        """Sample an action from the current policy.

        Args:
            state (np.array): representation of current state

        Return:
            value (np.array): state value
            action (np.array): action to be executed
            action_log_prob (np.array): log probability of the sampled action
        """
        state_tensor = paddle.to_tensor(state, dtype=paddle.float32)

        with paddle.no_grad():
            value, action, action_log_prob = self.alg.sample(state_tensor)

        value = value.detach().cpu().numpy().flatten()
        action = action.detach().cpu().numpy()
        action_log_prob = action_log_prob.cpu().numpy()

        return value, action, action_log_prob

    def value(self, state):
        """Predict the critic value.

        Args:
            state (np.array): representation of current state

        Return:
            value (np.array): state value
        """
        state_tensor = paddle.to_tensor(state, dtype=paddle.float32)

        with paddle.no_grad():
            value = self.alg.value(state_tensor).cpu().numpy()

        return value

    def save(self, model_path):
        """Save the model.

        Args:
            model_path (str): the path to save the model
        """
        sep = os.sep
        dirname = sep.join(model_path.split(sep)[:-1])
        if dirname != '' and not os.path.exists(dirname):
            os.makedirs(dirname)
        model_dict = {}
        model_dict["critic"] = self.alg.critic.state_dict()
        model_dict["actor"] = self.alg.actor.state_dict()
        model_dict["optim"] = self.alg.optim.state_dict()
        paddle.save(model_dict, model_path)

    def restore(self, model_path):
        """Restore the model.

        Args:
            model_path (str): the path to restore the model from
        """
        model_dict = paddle.load(model_path)
        self.alg.critic.set_state_dict(model_dict["critic"])
        self.alg.actor.set_state_dict(model_dict["actor"])
        self.alg.optim.set_state_dict(model_dict["optim"])
0 commit comments

Comments
 (0)